Data

# Load the annual forest data set and keep only the general and forest
# variables; rows missing the response (CL) or forest_density are dropped.
data <- readRDS("~/peregrine_amazon/data/annual/aad_2021_forests.rds") %>%
  # select only general and forest variables
  select(Code:Population, min_Precip:max_Precip_Month,
         forest_density, SWChange_abs:lat, -AvgRad, -StableLights) %>%
  filter(!is.na(CL),
         !is.na(forest_density)) %>%
  mutate(
    CL = as.factor(ifelse(CL > 0, 1, 0)),  # binarize incidence counts
    Country = as.factor(Country)
  )

# All columns except identifiers, the response, and the spatial CV fold id.
predictor_names <- setdiff(names(data), c("Code", "Name", "Country", "CL", "fold"))

Spatiotemporal CV

We try different approaches to troubleshoot problems that we may encounter. The basic structure of each approach is simple: for every spatiotemporal fold \(ij\), we use all data outside of fold \(ij\) as the training data for a logistic regression model and test it on the data inside fold \(ij\). Then we obtain the optimal threshold from a ROC curve on validation data and finally evaluate it on the untouched testing data.

Naive approach

Below, we just create a model on all the data without any spatiotemporal cross validation to see the top explanatory variables and compare with each approach.

set.seed(123)

# Use every non-identifier column as a predictor (Name/Country/Code are IDs,
# CL is the response, fold is the spatial CV assignment).
full.predictor_names <- data %>% 
  select(-c(Name, Country, Code, CL, fold)) %>% 
  names()

full.ml_formula <- as.formula(paste0("CL ~", paste(full.predictor_names, collapse=" + ")))

# get testing and training split, stratified on the response
split <- initial_split(data, strata = CL)
full.datrain <- training(split)
full.datest <- testing(split)

# get model: logistic regression on all predictors
full.mod <- glm(full.ml_formula, data=full.datrain, family=binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# calculate out of sample model performance on full.datest
full.oob <- predict(full.mod, newdata=full.datest, type='response', na.action = na.pass)

full.temp_auc_out <- pROC::roc(response = full.datest$CL, predictor= full.oob, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
full.best_threshold_out <- pROC::coords(full.temp_auc_out, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
full.metrica.format <- data.frame(
  cbind(ifelse(full.datest$CL==1, 1, 0),
        ifelse(full.oob>=full.best_threshold_out[1,1], 1, 0))) %>% # 0.821
  mutate_all(as.factor)

colnames(full.metrica.format) <- c("labels","predictions")
rownames(full.metrica.format) <- 1:dim(full.metrica.format)[1]

full.auc_out <- pROC::roc(response = full.metrica.format$labels, predictor= full.metrica.format$predictions %>% as.numeric() - 1, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
full.metrica.format.confusionMatrix <- full.metrica.format$predictions %>% 
  confusionMatrix(positive='1', reference=full.metrica.format$labels)

# Sensitivity = recall of the positive class ("1", second row of the table).
full.sensitivity_out <- caret::recall(
  data = full.metrica.format.confusionMatrix$table, 
  relevant = rownames(full.metrica.format.confusionMatrix$table)[2]
) 
# BUG FIX: caret::precision() returns positive predictive value, not
# specificity. The true negative rate needs caret::specificity() with the
# negative class ("0", first row of the table).
full.specificity_out <- caret::specificity(
  data = full.metrica.format.confusionMatrix$table,
  negative = rownames(full.metrica.format.confusionMatrix$table)[1]
)

full.var_imp_vars <- vip::vip(full.mod, num_features=20)
full.var_imp_vars

It seems that the socioeconomic and elevation variables sit at the top of the importance plot, as well as temperature and precipitation. We should take this with a grain of salt as it’s not terribly rigorous.

Now we attempt to run a bare-bones spatiotemporal cross validation approach. Here, we make sure that each testing fold analyzed has both present and absent points. Then we split the training data to create a validation set for model tuning in order to save the testing set datest for the end. Testing on the datrain.test data, we get a set of most important predictors and an optimal threshold to use on datest. We finally save the metrics to aggregate later before moving onto the next spatiotemporal fold.

set.seed(123)
# Spatiotemporal CV: hold out spatial fold i crossed with year j. Train on
# all rows outside both, tune on an inner split, then test inside fold ij.
for(j in data$Year %>% unique() %>% sort()){
  for(i in data$fold %>% unique() %>% sort()){
    print(paste(i,j, sep="_"))
    # first split using i and j
    datrain <- data %>% 
      filter(Year != j,
             fold != i)
    datest <- data %>% 
      filter(Year == j,
             fold == i)
    
    # skip fold if there is a complete imbalance of present vs. absent CL
    # (also skips empty folds, whose unique-class count is 0)
    if(datest$CL %>% unique() %>% length() != 2){next}
    
    # second split (on datrain) before testing on datest for further validation
    datrain.split <- initial_split(datrain)
    datrain.train <- training(datrain.split)
    datrain.test <- testing(datrain.split)
  
    # get initial model
    init.mod <- glm(full.ml_formula, data=datrain.train, family=binomial)
    
    # calculate out of sample model performance on datrain.test
    # BUG FIX: this previously called predict(full.mod, ...) -- the naive
    # model fit on a random split of ALL the data -- which leaks held-out
    # observations into the validation metrics and never uses init.mod.
    # Use the fold-specific init.mod (as the later loops correctly do).
    oob.datrain.test <- predict(init.mod, newdata=datrain.test, type='response') 
    auc_out.datrain.test <- pROC::roc(response = datrain.test$CL, predictor= oob.datrain.test, levels=c(0,1), auc = TRUE)
    best_threshold_out.datrain.test <- pROC::coords(auc_out.datrain.test, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
    metrica.format.datrain.test <- data.frame(
      cbind(ifelse(datrain.test$CL==1, 1, 0),
            ifelse(oob.datrain.test>=best_threshold_out.datrain.test[1,1], 1, 0))) %>% 
      mutate_all(as.factor)
    
    colnames(metrica.format.datrain.test) <- c("labels","predictions")
    rownames(metrica.format.datrain.test) <- 1:dim(metrica.format.datrain.test)[1]
    
    # NOTE: from here on metrica.format.datrain.test holds the
    # confusionMatrix object, not the labels/predictions data frame.
    metrica.format.datrain.test <- metrica.format.datrain.test$predictions %>% 
      confusionMatrix(positive='1', reference=metrica.format.datrain.test$labels)
    
    # sensitivity = recall of the positive class ("1", second table row)
    sensitivity_out.datrain.test <- caret::recall(
      data = metrica.format.datrain.test$table, 
      relevant = rownames(metrica.format.datrain.test$table)[2]
    ) 
    # BUG FIX: caret::precision() is positive predictive value, not
    # specificity; the true negative rate needs caret::specificity() with
    # the negative class ("0", first table row).
    specificity_out.datrain.test <- caret::specificity(
      data = metrica.format.datrain.test$table, 
      negative = rownames(metrica.format.datrain.test$table)[1]
    )
    
    var_imp_vars.datrain.test <- vip::vip(init.mod)$data$Variable
    
    # for retrieval later
    datrain.test.metrics <- list(
      auc = auc_out.datrain.test,
      best_threshold = best_threshold_out.datrain.test,
      confusion_matrix = metrica.format.datrain.test,
      sensitivity = sensitivity_out.datrain.test,
      specificity = specificity_out.datrain.test,
      imp_vars = var_imp_vars.datrain.test)

    # top validation-set predictors plus the forest variables of interest;
    # unique() avoids duplicating a forest variable already in the
    # importance list (matches the v1 aggregation step)
    edited.predictor_names <- c(datrain.test.metrics$imp_vars, "forest_density", "forest_fragmentation", "land_use_change", "edge_loss") %>% 
      unique()
    ## second model on all of datrain
    edited.ml_formula <- as.formula(paste0("CL ~ ", paste(edited.predictor_names, collapse="+")))
    edited.mod <- glm(edited.ml_formula, data=datrain, family=binomial)
    
    oob.datest <- predict(edited.mod, newdata=datest, type='response')
    
    auc_out.datest <- pROC::roc(response = datest$CL, predictor= oob.datest, levels=c(0,1), auc = TRUE)
    
    best_threshold_out.datest <- pROC::coords(auc_out.datest, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
    metrica.format.datest <- data.frame(
      cbind(ifelse(datest$CL==1, 1, 0),
            ifelse(oob.datest>=best_threshold_out.datest[1,1], 1, 0))) %>% 
      mutate_all(as.factor)
    colnames(metrica.format.datest) <- c("labels","predictions")
    rownames(metrica.format.datest) <- 1:dim(metrica.format.datest)[1]
    
    metrica.format.datest <- metrica.format.datest$predictions %>% 
      confusionMatrix(positive='1', reference=metrica.format.datest$labels)
    
    sensitivity_out.datest <- caret::recall(data = metrica.format.datest$table, 
                                                  relevant = rownames(metrica.format.datest$table)[2]) 
    # BUG FIX: same precision-vs-specificity mixup as above
    specificity_out.datest <- caret::specificity(data = metrica.format.datest$table, 
                                                 negative = rownames(metrica.format.datest$table)[1])
    
    var_imp_vars.datest <- vip::vip(edited.mod)$data$Variable
    
    # for retrieval later
    datest.metrics <- list(
      auc = auc_out.datest,
      best_threshold = best_threshold_out.datest,
      confusion_matrix = metrica.format.datest,
      sensitivity = sensitivity_out.datest,
      specificity = specificity_out.datest,
      imp_vars = var_imp_vars.datest)
    
    saveRDS(datrain.test.metrics,
            paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/metrics/datrain.test/", i, "_", j, "_metrics.rds"))
    saveRDS(datest.metrics,
            paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/metrics/datest/", i, "_", j, "_metrics.rds"))
    saveRDS(edited.mod, paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/models/", i, "_", j, "_edited_mod.rds"))
  }
}

Random resampling approach

For the most part, our code does not change. But first, we need to redefine the data that we partition into datrain and datest for each spatiotemporal fold:

set.seed(123)

# create holdout set, stratified by spatial fold so every fold is
# represented in both the training and held-out test portions
balanced.holdout.split <- initial_split(data, strata=fold)
balanced.holdout.train <- training(balanced.holdout.split)
balanced.holdout.test <- testing(balanced.holdout.split)

# Balance v1: downsample the CL-present rows to the number of CL-absent
# rows, then stack the two subsets.
# NOTE(review): this assumes presences outnumber absences; if not,
# slice_sample() (replace = FALSE) silently returns fewer rows and the
# result is not balanced -- confirm against the data.
balanced_data_v1 <- balanced.holdout.train %>% 
  filter(CL == 1) %>% 
  slice_sample(n = nrow(balanced.holdout.train %>% filter(CL == 0))) %>% 
  rbind(balanced.holdout.train %>% filter(CL == 0))

dim(data)
## [1] 26213    41
dim(balanced_data_v1)
## [1] 11048    41

Compare the dimensions of the full data and the balanced data. The balanced training data is less than half the size of the full data (roughly 42%)! The following code block uses the same structure as the one above using the full data.

resume_year = 2001
# Same spatiotemporal CV structure as the full-data loop above, but run on
# the class-balanced training data (balanced_data_v1).
for(j in resume_year:max(balanced_data_v1$Year)){
  for(i in balanced_data_v1$fold %>% unique() %>% sort()){
    print(paste(i,j, sep="_"))
    # first split using i and j
    datrain <- balanced_data_v1 %>% 
      filter(Year != j,
             fold != i)
    datest <- balanced_data_v1 %>% 
      filter(Year == j,
             fold == i)
    # if the data is too imbalanced, then we skip to the next iteration i_j
    if(datest$CL %>% unique() %>% length() == 1) {next}
    # ROBUSTNESS: an empty test fold has zero unique classes and slips past
    # the check above; guard explicitly (mirrors the v2 loop below)
    if(nrow(datest) == 0) {next}
    
    # second split (on datrain) before testing on datest
    datrain.split <- initial_split(datrain)
    datrain.train <- training(datrain.split)
    datrain.test <- testing(datrain.split)
  
    # get initial model
    init.mod <- glm(full.ml_formula, data=datrain.train, family=binomial)
    
    # calculate out of sample model performance on datrain.test
    oob.datrain.test <- predict(init.mod, newdata=datrain.test, type='response') 
    auc_out.datrain.test <- pROC::roc(response = datrain.test$CL, predictor= oob.datrain.test, levels=c(0,1), auc = TRUE)
    best_threshold_out.datrain.test <- pROC::coords(auc_out.datrain.test, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
    metrica.format.datrain.test <- data.frame(
      cbind(ifelse(datrain.test$CL==1, 1, 0),
            ifelse(oob.datrain.test>=best_threshold_out.datrain.test[1,1], 1, 0))) %>% 
      mutate_all(as.factor)
    
    colnames(metrica.format.datrain.test) <- c("labels","predictions")
    rownames(metrica.format.datrain.test) <- 1:dim(metrica.format.datrain.test)[1]
    
    # NOTE: from here on metrica.format.datrain.test holds the
    # confusionMatrix object, not the labels/predictions data frame.
    metrica.format.datrain.test <- metrica.format.datrain.test$predictions %>% 
      confusionMatrix(positive='1', reference=metrica.format.datrain.test$labels)
    
    # sensitivity = recall of the positive class ("1", second table row)
    sensitivity_out.datrain.test <- caret::recall(
      data = metrica.format.datrain.test$table, 
      relevant = rownames(metrica.format.datrain.test$table)[2]
    ) 
    # BUG FIX: caret::precision() is positive predictive value, not
    # specificity; the true negative rate needs caret::specificity() with
    # the negative class ("0", first table row).
    specificity_out.datrain.test <- caret::specificity(
      data = metrica.format.datrain.test$table, 
      negative = rownames(metrica.format.datrain.test$table)[1]
    )
    
    var_imp_vars.datrain.test <- vip::vip(init.mod)$data$Variable
    
    # for retrieval later
    datrain.test.metrics <- list(
      auc = auc_out.datrain.test,
      best_threshold = best_threshold_out.datrain.test,
      confusion_matrix = metrica.format.datrain.test,
      sensitivity = sensitivity_out.datrain.test,
      specificity = specificity_out.datrain.test,
      imp_vars = var_imp_vars.datrain.test)
    
    # top validation-set predictors plus the forest variables of interest;
    # unique() avoids duplicating a forest variable already in the
    # importance list (matches the v1 aggregation step)
    edited.predictor_names <- c(datrain.test.metrics$imp_vars, "forest_density", "forest_fragmentation", "land_use_change", "edge_loss") %>% 
      unique()
    ## second model on all of datrain
    edited.ml_formula <- as.formula(paste0("CL ~ ", paste(edited.predictor_names, collapse="+")))
    edited.mod <- glm(edited.ml_formula, data=datrain, family=binomial)
    
    oob.datest <- predict(edited.mod, newdata=datest, type='response')
    
    auc_out.datest <- pROC::roc(response = datest$CL, predictor= oob.datest, levels=c(0,1), auc = TRUE)
    
    best_threshold_out.datest <- pROC::coords(auc_out.datest, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
    metrica.format.datest <- data.frame(
      cbind(ifelse(datest$CL==1, 1, 0),
            ifelse(oob.datest>=best_threshold_out.datest[1,1], 1, 0))) %>% 
      mutate_all(as.factor)
    colnames(metrica.format.datest) <- c("labels","predictions")
    rownames(metrica.format.datest) <- 1:dim(metrica.format.datest)[1]
    
    metrica.format.datest <- metrica.format.datest$predictions %>% 
      confusionMatrix(positive='1', reference=metrica.format.datest$labels)
    
    sensitivity_out.datest <- caret::recall(data = metrica.format.datest$table, 
                                                  relevant = rownames(metrica.format.datest$table)[2]) 
    # BUG FIX: same precision-vs-specificity mixup as above
    specificity_out.datest <- caret::specificity(data = metrica.format.datest$table, 
                                                 negative = rownames(metrica.format.datest$table)[1])
    
    var_imp_vars.datest <- vip::vip(edited.mod)$data$Variable
    
    # for retrieval later
    datest.metrics <- list(
      auc = auc_out.datest,
      best_threshold = best_threshold_out.datest,
      confusion_matrix = metrica.format.datest,
      sensitivity = sensitivity_out.datest,
      specificity = specificity_out.datest,
      imp_vars = var_imp_vars.datest)
    
    saveRDS(datrain.test.metrics,
            paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/metrics/datrain.test/", i, "_", j, "_metrics.rds"))
    saveRDS(datest.metrics,
            paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/metrics/datest/", i, "_", j, "_metrics.rds"))
    saveRDS(edited.mod, paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/models/", i, "_", j, "_edited_mod.rds"))
  }
}

Maybe we can further choose a better resampling method since we skip a lot of iterations.

Resampling method v2

What if we go into each spatiotemporal fold \(ij\) and take \(n_{ij}\) to be the number of observations with absent incidence in that spatiotemporal fold? Note that \(n_{ij} \geq 0\).

set.seed(123)

# Build balanced_data_v2: within each spatiotemporal fold ij, keep all
# absent rows and sample an equal number of present rows.
# Accumulate the per-fold samples in a list and bind once at the end;
# growing a data.frame with rbind() inside a double loop is O(n^2).
balanced_pieces <- list()

for(j in balanced.holdout.train$Year %>% unique() %>% sort()){
  for(i in balanced.holdout.train$fold %>% unique() %>% sort()){
    # get ij fold subset
    data_ij <- balanced.holdout.train %>% 
      filter(Year == j,
             fold == i)
    
    # get subset of ij fold subset of only observations with no CL occurrence 
    data_ij_absent <- data_ij %>% 
      filter(CL == 0)
    
    # get the length to use when sampling to balance the data set
    n_ij <- nrow(data_ij_absent)
    
    if(n_ij == 0){next}
    
    # sample n_ij positive occurrence observations in ij fold
    n_data_ij_positive <- data_ij %>% 
      filter(CL == 1) %>% 
      nrow()
    
    if(n_data_ij_positive == 0){next}
    
    # NOTE(review): when positives are scarcer than absents, ALL absents are
    # still kept below, so that fold is not strictly balanced -- confirm
    # this is intended.
    n_ij <- min(n_ij, n_data_ij_positive)
    
    data_ij_positive_sample <- data_ij %>% 
      filter(CL == 1) %>% 
      slice_sample(n=n_ij)
    
    data_ij_sample <- data_ij_positive_sample  %>% 
      rbind(data_ij_absent) # combine with subset with no CL occurrence
    
    balanced_pieces[[length(balanced_pieces) + 1L]] <- data_ij_sample
    
    print(paste(j, i, n_ij, nrow(data_ij_positive_sample)))
  }
}

# combine all per-fold samples into the main balanced data frame
balanced_data_v2 <- do.call(rbind, balanced_pieces)
## [1] "2001 1 8 8"
## [1] "2001 2 13 13"
## [1] "2001 3 117 117"
## [1] "2002 1 4 4"
## [1] "2002 2 6 6"
## [1] "2002 3 108 108"
## [1] "2003 1 3 3"
## [1] "2003 2 3 3"
## [1] "2003 3 111 111"
## [1] "2004 1 5 5"
## [1] "2004 2 7 7"
## [1] "2004 3 126 126"
## [1] "2005 1 3 3"
## [1] "2005 2 4 4"
## [1] "2005 3 100 100"
## [1] "2006 1 3 3"
## [1] "2006 2 1 1"
## [1] "2006 3 115 115"
## [1] "2007 1 33 33"
## [1] "2008 1 22 22"
## [1] "2009 1 23 23"
## [1] "2009 2 8 8"
## [1] "2009 3 135 135"
## [1] "2010 1 380 380"
## [1] "2011 1 373 373"
## [1] "2012 1 359 359"
## [1] "2013 1 351 351"
## [1] "2014 1 370 370"
## [1] "2015 1 331 331"
## [1] "2016 1 35 35"
## [1] "2016 2 8 8"
## [1] "2016 3 186 186"
## [1] "2017 1 28 28"
## [1] "2017 2 5 5"
## [1] "2017 3 182 182"
## [1] "2018 1 23 23"
## [1] "2018 2 8 8"
## [1] "2018 3 169 169"
## [1] "2019 1 28 28"
## [1] "2019 2 8 8"
## [1] "2019 3 154 154"
## [1] "2020 1 28 28"
## [1] "2020 2 7 7"
## [1] "2020 3 145 145"

Let’s save this balanced data for later. For now, we get our ideal threshold and predictor variables from balanced_data_v1. To get our threshold, we just take the mean of our thresholds provided by each fold. To get our predictor variables, we count each time the predictor variable appears in all folds.

# Aggregate the per-fold test metrics saved by the balanced-v1 CV loop.
# Collect values in lists and flatten once afterwards (growing vectors with
# c() inside a loop re-copies the whole vector every iteration).
threshold.list <- list()
imp.vars.list <- list()
auc.list <- list()
p_val.list <- list()

resume_year <- 2001

for(j in resume_year:max(balanced_data_v1$Year)){
  for(i in balanced_data_v1$fold %>% unique() %>% sort()){
    # build the path once per fold instead of repeating the long paste0()
    metrics_path <- paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/metrics/datest/", i, "_", j, "_metrics.rds")
    # make sure file exists (we excluded several folds in preliminary analysis due to a class being absent)
    if (!file.exists(metrics_path)) {next} 
    
    this.metrics <- readRDS(metrics_path)
    
    key <- paste(i, j, sep = "_")
    threshold.list[[key]] <- this.metrics$best_threshold[[1]]
    imp.vars.list[[key]] <- this.metrics$imp_vars
    auc.list[[key]] <- this.metrics$auc$auc[[1]]
    p_val.list[[key]] <- this.metrics$confusion_matrix$overall["AccuracyPValue"][[1]]
  }
}

# flatten into the plain vectors used below
threshold <- unlist(threshold.list, use.names = FALSE)
imp.vars <- unlist(imp.vars.list, use.names = FALSE)
auc <- unlist(auc.list, use.names = FALSE)
p_val <- unlist(p_val.list, use.names = FALSE)

# get mean threshold
mean.threshold <- mean(threshold)

# turn imp.vars into data frame
imp.vars.df <- data.frame(var = factor(imp.vars))
# get count of each variable to rank general importance to apply to main data
v1.imp.vars.count <- imp.vars.df %>% 
  count(var) %>% 
  arrange(desc(n))

# get mean auc
mean.auc <- mean(auc)
# get mean accuracy p-value
mean.p_val <- mean(p_val)

metrics_df.balanced_v1 <- data.frame(
  metric = c('AUC', 'Accuracy P-value', 'Threshold'), 
  value = c(mean.auc, mean.p_val, mean.threshold)
)

metrics_df.balanced_v1
##             metric     value
## 1              AUC 0.7523907
## 2 Accuracy P-value 0.5261833
## 3        Threshold 0.8011194

This approach gives a pretty high accuracy p-value (0.53) but a decent AUC value (0.75).

On threshold

We can probably do better by weighting the threshold by a performance metric like no information rate (NIR) or P-value [Acc > NIR] (this.metrics$confusion_matrix$overall["AccuracyPValue"][[1]]).

On variable importance

This method by itself can be improved by giving a weight, perhaps weighting by order of importance in each fold or a performance metric. (See above.)

For now, we can carry on with this approach and see how well it performs. We choose the 10 most popular predictor variables to use in our formula and train a model on the testing data. [TODO: continue]

#### change v1.datrain/v1.datest -> balanced.holdout.train/v1.datest

set.seed(123)
# choose the first 10 predictor variables (most frequently important across folds)
v1.imp_vars <- v1.imp.vars.count %>% 
  select(var) %>% 
  filter(row_number() <= 10) %>% 
  mutate(var = as.character(var))

# get our predictor names into vector format but make sure to include the forest variables
v1.predictor_names <- c(v1.imp_vars$var, "forest_density", "forest_fragmentation", "land_use_change", "edge_loss") %>%
  unique()

# get our threshold (mean of the per-fold optimal thresholds)
v1.threshold <- metrics_df.balanced_v1 %>% 
  filter(metric=='Threshold') %>% 
  select(value) %>% 
  as.numeric()

# make the formula based on our chosen predictors
v1.ml_formula <- as.formula(paste0("CL ~", paste(v1.predictor_names, collapse=" + ")))

# get testing and training split
# split <- initial_split(data, strata = CL)
# v1.datrain <- training(split)
# v1.datest <- testing(split)

v1.datrain <- balanced.holdout.train
v1.datest <- balanced.holdout.test

# get model
v1.mod <- glm(v1.ml_formula, data=v1.datrain, family=binomial) ##
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# calculate out of sample model performance on v1.datest
v1.oob <- predict(v1.mod, newdata=v1.datest, type='response', na.action = na.pass)

v1.temp_auc_out <- pROC::roc(response = v1.datest$CL, predictor= v1.oob, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
v1.best_threshold_out <- pROC::coords(v1.temp_auc_out, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
v1.metrica.format <- data.frame(
  cbind(ifelse(v1.datest$CL==1, 1, 0),
        ifelse(v1.oob>=v1.best_threshold_out[1,1], 1, 0))) %>% # v1.best_threshold_out[1,1] -> 0.8243; 
  mutate_all(as.factor)

colnames(v1.metrica.format) <- c("labels","predictions")
rownames(v1.metrica.format) <- 1:dim(v1.metrica.format)[1]

# CONSISTENCY: use the numeric 0/1 predictor form as in the naive approach
# (the previous as.ordered() form yields the same ranking but an
# inconsistent predictor type across the document)
v1.auc_out <- pROC::roc(response = v1.metrica.format$labels, predictor= v1.metrica.format$predictions %>% as.numeric() - 1, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
v1.metrica.format.confusionMatrix <- v1.metrica.format$predictions %>% 
  confusionMatrix(positive='1', reference=v1.metrica.format$labels)

# sensitivity = recall of the positive class ("1", second table row)
v1.sensitivity_out <- caret::recall(
  data = v1.metrica.format.confusionMatrix$table, 
  relevant = rownames(v1.metrica.format.confusionMatrix$table)[2]
) 
# BUG FIX: caret::precision() is positive predictive value, not specificity;
# compute the true negative rate with caret::specificity() on the negative
# class ("0", first table row).
v1.specificity_out <- caret::specificity(
  data = v1.metrica.format.confusionMatrix$table, 
  negative = rownames(v1.metrica.format.confusionMatrix$table)[1]
)

v1.var_imp_vars <- vip::vip(v1.mod, num_features = 20)
v1.var_imp_vars

The top variables are precipitation, elevation, and socioeconomic again.

Partial dependence plots

# Partial dependence of v1.mod's predicted response on each forest variable
# and two climate variables, one plot per predictor.
v1.partial.forest_density <- pdp::partial(v1.mod, pred.var='forest_density')
pdp::plotPartial(v1.partial.forest_density)

v1.partial.forest_fragmentation <- pdp::partial(v1.mod, pred.var='forest_fragmentation')
pdp::plotPartial(v1.partial.forest_fragmentation)

v1.partial.land_use_change <- pdp::partial(v1.mod, pred.var='land_use_change')
pdp::plotPartial(v1.partial.land_use_change)

v1.partial.edge_loss <- pdp::partial(v1.mod, pred.var='edge_loss')
pdp::plotPartial(v1.partial.edge_loss)

v1.partial.Tair_f_tavg <- pdp::partial(v1.mod, pred.var='Tair_f_tavg')
pdp::plotPartial(v1.partial.Tair_f_tavg)

v1.partial.min_Precip <- pdp::partial(v1.mod, pred.var='min_Precip')
pdp::plotPartial(v1.partial.min_Precip)

# Coefficient table for the final v1 logistic regression.
summary(v1.mod)
## 
## Call:
## glm(formula = v1.ml_formula, family = binomial, data = v1.datrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -6.2569  -0.2023   0.3611   0.6041   4.3589  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -1.381e+01  9.227e+00  -1.496 0.134620    
## min_Elevation        -1.456e-03  7.708e-05 -18.883  < 2e-16 ***
## Population            4.606e-05  2.081e-06  22.130  < 2e-16 ***
## min_Precip           -1.971e-02  8.266e-04 -23.843  < 2e-16 ***
## forest_density        1.051e-02  1.191e-03   8.826  < 2e-16 ***
## HNTL                 -1.413e-01  8.798e-03 -16.059  < 2e-16 ***
## Wind_f_tavg          -1.921e-01  3.164e-02  -6.070 1.28e-09 ***
## max_Elevation         2.279e-05  5.490e-05   0.415 0.678062    
## LST_Night             6.584e-02  1.592e-02   4.137 3.52e-05 ***
## long                 -3.964e-07  3.462e-08 -11.450  < 2e-16 ***
## Year                  6.149e-03  4.617e-03   1.332 0.182915    
## forest_fragmentation  6.327e-07  6.725e-07   0.941 0.346839    
## land_use_change       7.694e-03  2.953e-02   0.261 0.794467    
## edge_loss             5.090e-07  1.470e-07   3.461 0.000538 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23350  on 19657  degrees of freedom
## Residual deviance: 14279  on 19644  degrees of freedom
## AIC: 14307
## 
## Number of Fisher Scoring iterations: 7

This does not seem to give much deeper insight. We investigate the incorrectly classified observations:

# Attach the thresholded predictions to the holdout test set to inspect
# which observations were misclassified.
# NOTE(review): mutate(.pred = ...) binds v1.metrica.format$predictions by
# position and assumes its row order matches balanced.holdout.test exactly
# (both derive from v1.datest without reordering) -- confirm before reuse.
v1.incorrect <- balanced.holdout.test %>% 
  mutate(.pred = v1.metrica.format$predictions) %>% 
  mutate(correct = .pred==CL) %>% 
  select(Code, Name, Country, Year, CL, correct, everything())

v1.incorrect %>% summary() # there is a 1787 to 4768 split between true CL absent vs. present
##       Code             Name               Country          Year      CL      
##  Min.   :  10101   Length:6555        Brazil  :4272   Min.   :2001   0:1787  
##  1st Qu.: 120608   Class :character   Colombia: 317   1st Qu.:2008   1:4768  
##  Median :1503457   Mode  :character   Peru    :1966   Median :2012           
##  Mean   :1783845                                      Mean   :2012           
##  3rd Qu.:2107506                                      3rd Qu.:2015           
##  Max.   :5300108                                      Max.   :2020           
##                                                                              
##   correct             NDVI             EVI            LST_Day     
##  Mode :logical   Min.   :0.1767   Min.   :0.1016   Min.   :11.77  
##  FALSE:1198      1st Qu.:0.5453   1st Qu.:0.3354   1st Qu.:25.95  
##  TRUE :5357      Median :0.6172   Median :0.3995   Median :28.82  
##                  Mean   :0.5928   Mean   :0.3809   Mean   :27.89  
##                  3rd Qu.:0.6722   3rd Qu.:0.4507   3rd Qu.:31.27  
##                  Max.   :0.8208   Max.   :0.5459   Max.   :36.71  
##                                                                   
##   min_LST_Day      max_LST_Day      LST_Night      min_LST_Night    
##  Min.   :-8.793   Min.   :18.22   Min.   :-5.268   Min.   :-37.750  
##  1st Qu.:17.961   1st Qu.:31.03   1st Qu.:15.358   1st Qu.:  5.081  
##  Median :23.644   Median :34.80   Median :20.881   Median : 15.916  
##  Mean   :20.851   Mean   :34.77   Mean   :17.029   Mean   : 10.702  
##  3rd Qu.:25.447   3rd Qu.:38.58   3rd Qu.:21.779   3rd Qu.: 18.420  
##  Max.   :31.086   Max.   :48.22   Max.   :24.953   Max.   : 22.886  
##                                                                     
##  max_LST_Night         HNTL             Precip       Evap_tavg        
##  Min.   :-1.856   Min.   : 0.0000   Min.   : 184   Min.   :1.718e-06  
##  1st Qu.:19.443   1st Qu.: 0.1043   1st Qu.:1179   1st Qu.:3.313e-05  
##  Median :23.256   Median : 0.5435   Median :1636   Median :4.163e-05  
##  Mean   :19.940   Mean   : 1.9367   Mean   :1702   Mean   :3.879e-05  
##  3rd Qu.:24.188   3rd Qu.: 2.3949   3rd Qu.:2102   3rd Qu.:4.581e-05  
##  Max.   :28.188   Max.   :60.8290   Max.   :4831   Max.   :6.252e-05  
##                                                                       
##   Qair_f_tavg       SoilMoi00_10cm_tavg  Wind_f_tavg     Tair_f_tavg   
##  Min.   :0.004301   Min.   :0.1988      Min.   :1.412   Min.   :273.6  
##  1st Qu.:0.011649   1st Qu.:0.3041      1st Qu.:2.354   1st Qu.:294.6  
##  Median :0.014090   Median :0.3297      Median :3.111   Median :298.9  
##  Mean   :0.013457   Mean   :0.3339      Mean   :3.169   Mean   :295.3  
##  3rd Qu.:0.016298   3rd Qu.:0.3618      3rd Qu.:3.874   3rd Qu.:299.8  
##  Max.   :0.018958   Max.   :0.4281      Max.   :6.758   Max.   :302.2  
##                                                                        
##    Population          min_Precip         max_Precip     min_Precip_Month
##  Min.   :    111.6   Min.   :  0.0000   Min.   : 31.54   Min.   : 1.000  
##  1st Qu.:   3875.6   1st Qu.:  0.3931   1st Qu.:243.89   1st Qu.: 6.000  
##  Median :   8904.1   Median :  6.9627   Median :331.79   Median : 7.000  
##  Mean   :  24261.9   Mean   : 19.8774   Mean   :329.78   Mean   : 7.245  
##  3rd Qu.:  19207.4   3rd Qu.: 22.2811   3rd Qu.:403.59   3rd Qu.: 8.000  
##  Max.   :2791764.8   Max.   :281.0092   Max.   :949.69   Max.   :12.000  
##                                                                          
##  max_Precip_Month forest_density   SWChange_abs       SWChange_norm     
##  Min.   : 1.000   Min.   : 0.00   Min.   :-128.0000   Min.   :-128.000  
##  1st Qu.: 2.000   1st Qu.: 7.28   1st Qu.:  -3.6756   1st Qu.:   1.579  
##  Median : 3.000   Median :23.04   Median :   0.8671   Median :  15.067  
##  Mean   : 4.149   Mean   :33.27   Mean   :   2.6775   Mean   :  21.892  
##  3rd Qu.: 5.000   3rd Qu.:57.09   3rd Qu.:   8.1409   3rd Qu.:  39.375  
##  Max.   :12.000   Max.   :99.65   Max.   :  84.6450   Max.   : 100.000  
##                                   NA's   :85          NA's   :85        
##   SWOccurrence    SWRecurrence    SWSeasonality    max_Elevation 
##  Min.   : 0.00   Min.   : 24.00   Min.   : 1.000   Min.   :  29  
##  1st Qu.:41.24   1st Qu.: 77.28   1st Qu.: 5.324   1st Qu.: 349  
##  Median :60.27   Median : 85.10   Median : 7.271   Median : 646  
##  Mean   :58.09   Mean   : 83.49   Mean   : 7.172   Mean   :1528  
##  3rd Qu.:76.43   3rd Qu.: 91.27   3rd Qu.: 9.103   3rd Qu.:2763  
##  Max.   :98.41   Max.   :100.00   Max.   :11.908   Max.   :6544  
##  NA's   :85      NA's   :85       NA's   :112                    
##  mean_Elevation     min_Elevation    var_Elevation            fold      
##  Min.   :   3.383   Min.   :  -3.0   Min.   :     10.9   Min.   :1.000  
##  1st Qu.: 170.384   1st Qu.:  82.0   1st Qu.:    901.2   1st Qu.:1.000  
##  Median : 320.461   Median : 199.0   Median :   4683.5   Median :3.000  
##  Mean   :1006.483   Mean   : 631.1   Mean   :  89736.7   Mean   :2.126  
##  3rd Qu.:1125.551   3rd Qu.: 544.0   3rd Qu.:  69556.4   3rd Qu.:3.000  
##  Max.   :4814.186   Max.   :4350.0   Max.   :2298170.4   Max.   :3.000  
##                                                                         
##  forest_fragmentation land_use_change      edge_loss             long         
##  Min.   :     0.0     Min.   :-7.70056   Min.   :-2677290   Min.   :-8863907  
##  1st Qu.:   686.8     1st Qu.:-0.25048   1st Qu.:  -13590   1st Qu.:-8226324  
##  Median :  1724.1     Median :-0.01798   Median :     480   Median :-6295076  
##  Mean   : 14365.0     Mean   :-0.17601   Mean   :   27872   Mean   :-6679746  
##  3rd Qu.:  5751.9     3rd Qu.: 0.03911   3rd Qu.:   29940   3rd Qu.:-5380194  
##  Max.   :750054.3     Max.   : 6.55408   Max.   : 4414320   Max.   :-4851417  
##                                                                               
##       lat           .pred   
##  Min.   :-2124011   0:2057  
##  1st Qu.:-1435452   1:4498  
##  Median : -952690           
##  Mean   : -934598           
##  3rd Qu.: -506889           
##  Max.   :  649268           
## 
# plots
v1.incorrect.important_vars <- v1.incorrect %>% 
  select(Code, Name, Year, CL, correct, v1.imp_vars$var) %>%
  mutate(log_Population = log(Population)) %>% 
  select(-Population) %>% 
  pivot_longer(cols = c(min_Elevation:log_Population))

# Boxplots of each top predictor, split by whether the observation was
# classified correctly.
ggplot(v1.incorrect.important_vars, aes(x = correct, y = value)) +
  geom_boxplot() +
  facet_wrap(vars(name), scales = "free")

# Density curves of each top predictor, coloured by classification outcome.
ggplot(v1.incorrect.important_vars, aes(x = value, col = correct)) +
  geom_density() +
  facet_wrap(vars(name), scales = "free")

The main takeaways here are that incorrectly identified observations tend to have lower forest density, lower population, and higher wind speeds than correctly identified observations.

Using resampling method v2 (balanced_data_v2)

We go on now to use the balanced_data_v2 data set.

set.seed(123)
# First year of the spatiotemporal grid (fix: use <- for assignment, not =).
current_j <- 2001

# Spatiotemporal CV on balanced_data_v2: for each (fold i, year j) cell,
# train on all data outside the cell, tune a classification threshold on an
# inner train/test split, refit on the top-ranked variables, and score the
# held-out cell. Per-cell metrics and the refit model are saved to disk.
for(j in current_j:max(balanced_data_v2$Year)){
  for(i in balanced_data_v2$fold %>% unique() %>% sort()){
    print(paste(i,j, sep="_"))
    # first split using i and j
    datrain <- balanced_data_v2 %>% 
      filter(Year != j,
             fold != i)
    datest <- balanced_data_v2 %>% 
      filter(Year == j,
             fold == i)
    # Skip empty cells first, then cells where only one class is present
    # (ROC / threshold selection needs both classes). Checking emptiness
    # before touching datest$CL is the safer order.
    if(nrow(datest) == 0) {next}
    if(datest$CL %>% unique() %>% length() == 1) {next}
    
    # second split (on datrain) before testing on datest
    datrain.split <- initial_split(datrain)
    datrain.train <- training(datrain.split)
    datrain.test <- testing(datrain.split)
  
    # get initial model on the full predictor set
    init.mod <- glm(full.ml_formula, data=datrain.train, family=binomial)
    
    # calculate out of sample model performance on datrain.test
    oob.datrain.test <- predict(init.mod, newdata=datrain.test, type='response') 
    auc_out.datrain.test <- pROC::roc(response = datrain.test$CL, predictor= oob.datrain.test, levels=c(0,1), auc = TRUE)
    # NOTE: pROC::coords(..., "best") can return several equally good
    # thresholds; only the first row ([1,1]) is used below.
    best_threshold_out.datrain.test <- pROC::coords(auc_out.datrain.test, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
    # binarize truth and predictions at the chosen threshold
    metrica.format.datrain.test <- data.frame(
      cbind(ifelse(datrain.test$CL==1, 1, 0),
            ifelse(oob.datrain.test>=best_threshold_out.datrain.test[1,1], 1, 0))) %>% 
      mutate_all(as.factor)
    
    colnames(metrica.format.datrain.test) <- c("labels","predictions")
    rownames(metrica.format.datrain.test) <- seq_len(nrow(metrica.format.datrain.test))
    
    metrica.format.datrain.test <- metrica.format.datrain.test$predictions %>% 
      confusionMatrix(positive='1', reference=metrica.format.datrain.test$labels)
    
    sensitivity_out.datrain.test <- caret::recall(
      data = metrica.format.datrain.test$table, 
      relevant = rownames(metrica.format.datrain.test$table)[2]
    ) 
    # NOTE(review): caret::precision computes precision, not specificity,
    # despite the variable name -- confirm which metric was intended.
    specificity_out.datrain.test <- caret::precision(
      data = metrica.format.datrain.test$table, 
      relevant = rownames(metrica.format.datrain.test$table)[2]
    )
    
    var_imp_vars.datrain.test <- vip::vip(init.mod)$data$Variable # picks top ten features
    
    
    # for retrieval later
    datrain.test.metrics <- list(
      auc = auc_out.datrain.test,
      best_threshold = best_threshold_out.datrain.test,
      confusion_matrix = metrica.format.datrain.test,
      sensitivity = sensitivity_out.datrain.test,
      specificity = specificity_out.datrain.test,
      imp_vars = var_imp_vars.datrain.test
    )
    
    # Always keep the four forest covariates alongside the data-driven top ten.
    edited.predictor_names <- c(datrain.test.metrics$imp_vars, "forest_density", "forest_fragmentation", "land_use_change", "edge_loss")
    ## second model on all of datrain
    edited.ml_formula <- as.formula(paste0("CL ~ ", paste(edited.predictor_names, collapse="+")))
    edited.mod <- glm(edited.ml_formula, data=datrain, family=binomial)
    
    oob.datest <- predict(edited.mod, newdata=datest, type='response')
    
    auc_out.datest <- pROC::roc(response = datest$CL, predictor= oob.datest, levels=c(0,1), auc = TRUE)
    
    best_threshold_out.datest <- pROC::coords(auc_out.datest, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
    metrica.format.datest <- data.frame(
      cbind(ifelse(datest$CL==1, 1, 0),
            ifelse(oob.datest>=best_threshold_out.datest[1,1], 1, 0))) %>% 
      mutate_all(as.factor)
    colnames(metrica.format.datest) <- c("labels","predictions")
    rownames(metrica.format.datest) <- seq_len(nrow(metrica.format.datest))
    
    metrica.format.datest <- metrica.format.datest$predictions %>% 
      confusionMatrix(positive='1', reference=metrica.format.datest$labels)
    
    sensitivity_out.datest <- caret::recall(data = metrica.format.datest$table, 
                                                  relevant = rownames(metrica.format.datest$table)[2]) 
    specificity_out.datest <- caret::precision(data = metrica.format.datest$table, 
                                                     relevant = rownames(metrica.format.datest$table)[2])
    
    var_imp_vars.datest <- vip::vip(edited.mod)$data$Variable
    
    # for retrieval later
    datest.metrics <- list(
      auc = auc_out.datest,
      best_threshold = best_threshold_out.datest,
      confusion_matrix = metrica.format.datest,
      sensitivity = sensitivity_out.datest,
      specificity = specificity_out.datest,
      imp_vars = var_imp_vars.datest)
    
    saveRDS(datrain.test.metrics,
            paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/metrics.v2/datrain.test/", i, "_", j, "_metrics.rds"))
    saveRDS(datest.metrics,
            paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/metrics.v2/datest/", i, "_", j, "_metrics.rds"))
    saveRDS(edited.mod, paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/models.v2/", i, "_", j, "_edited_mod.rds"))
  }
}

Using the above models, let’s find a good final threshold and variables to include in the final model, like we did for balanced_data_v1.

# Accumulators for the per-fold hold-out metrics saved by the CV loop above.
threshold <- c()
imp.vars <- c()
auc <- c()
p_val <- c()

# resume_year is assumed to be defined in an earlier chunk.
for(j in resume_year:max(balanced_data_v2$Year)){
  for(i in balanced_data_v2$fold %>% unique() %>% sort()){
    # Build the path once per (i, j); some year/fold combinations have no
    # metrics file because single-class test sets were skipped above.
    this.path <- paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/metrics.v2/datest/", i, "_", j, "_metrics.rds")
    if (!file.exists(this.path)) {next} 
    
    this.metrics <- readRDS(this.path)
    
    threshold <- c(threshold, this.metrics$best_threshold[[1]])
    imp.vars <- c(imp.vars, this.metrics$imp_vars)
    auc <- c(auc, this.metrics$auc$auc[[1]])
    p_val <- c(p_val, this.metrics$confusion_matrix$overall["AccuracyPValue"][[1]])
  }
}

# get mean threshold across all retained (i, j) folds
mean.threshold <- mean(threshold)

# turn imp.vars into data frame
# (imp.vars is a character vector at this point, so factor() is valid here --
# unlike the weighted sections below, where imp.vars is a data frame)
imp.vars.df <- data.frame(var = factor(imp.vars))
# get count of each variable to rank general importance to apply to main data
v2.imp.vars.count <- imp.vars.df %>% 
  count(var) %>% 
  arrange(desc(n))

# get mean auc
mean.auc <- mean(auc)
# get mean accuracy p-value
mean.p_val <- mean(p_val)

# collect the cross-validated summary metrics for balanced_data_v2
metrics_df.balanced_v2 <- data.frame(metric = c('AUC', 'Accuracy P-value', 'Threshold'), value = c(mean.auc, mean.p_val, mean.threshold))
metrics_df.balanced_v2
##             metric     value
## 1              AUC 0.7526243
## 2 Accuracy P-value 0.1034658
## 3        Threshold 0.2897455

We compare this to the results of metrics_df.balanced_v1:

# Cross-validated summary metrics from the v1 resampling, for comparison.
metrics_df.balanced_v1
##             metric     value
## 1              AUC 0.7523907
## 2 Accuracy P-value 0.5261833
## 3        Threshold 0.8011194
# change meaning of v2.datrain and v2.datest to balanced.holdout.train and balanced.holdout.test

set.seed(123)
# Top-10 most frequently selected variables across the v2 CV folds.
v2.imp_vars <- v2.imp.vars.count %>% 
  select(var) %>% 
  filter(row_number() <= 10) %>% 
  mutate(var = as.character(var))

# Always keep the four forest covariates alongside the data-driven top ten.
v2.predictor_names <- c(v2.imp_vars$var, "forest_density", "forest_fragmentation", "land_use_change", "edge_loss") %>%
  unique()

# NOTE(review): v2.threshold (the CV-mean threshold) is computed here but
# never used below -- the threshold is re-derived on the hold-out test set
# instead, which optimistically biases the reported confusion matrix.
# Confirm which threshold was intended.
v2.threshold <- metrics_df.balanced_v2 %>% filter(metric=='Threshold') %>% select(value) %>% as.numeric()

v2.ml_formula <- as.formula(paste0("CL ~", paste(v2.predictor_names, collapse=" + ")))

# get testing and training split
# NOTE(review): this split is computed but unused (the training()/testing()
# lines are commented out); it still consumes random numbers after
# set.seed(123), so removing it would shift the RNG stream.
split <- initial_split(data, strata = CL)
# v2.datrain <- training(split)
# v2.datest <- testing(split)

v2.datrain <- balanced.holdout.train
v2.datest <- balanced.holdout.test

# get model
v2.mod <- glm(v2.ml_formula, data=v2.datrain, family=binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# calculate out of sample model performance on v2.datest
v2.oob <- predict(v2.mod, newdata=v2.datest, type='response', na.action = na.pass)

v2.temp_auc_out <- pROC::roc(response = v2.datest$CL, predictor= v2.oob, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
# NOTE(review): "best" threshold chosen on the test data itself (see note above).
v2.best_threshold_out <- pROC::coords(v2.temp_auc_out, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
# Binarize truth and predictions at the chosen threshold.
v2.metrica.format <- data.frame(
  cbind(ifelse(v2.datest$CL==1, 1, 0),
        ifelse(v2.oob>=v2.best_threshold_out[1,1], 1, 0))) %>% # 0.821
  mutate_all(as.factor)

colnames(v2.metrica.format) <- c("labels","predictions")
rownames(v2.metrica.format) <- 1:dim(v2.metrica.format)[1]

# ROC on the already-thresholded predictions (its AUC, 0.7946 below, matches
# the confusion matrix's balanced accuracy).
v2.auc_out <- pROC::roc(response = v2.metrica.format$labels, 
                        predictor = v2.metrica.format$predictions %>% as.numeric() - 1,
                        levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
v2.metrica.format.confusionMatrix <- v2.metrica.format$predictions %>% 
  confusionMatrix(positive='1', reference=v2.metrica.format$labels)

# NOTE(review): caret::recall / caret::precision compute recall and precision;
# precision is NOT specificity despite the variable name -- confirm intent.
v2.sensitivity_out <- caret::recall(data = v2.metrica.format.confusionMatrix$table, 
                                    relevant = rownames(v2.metrica.format.confusionMatrix$table)[2]) 
v2.specificity_out <- caret::precision(data = v2.metrica.format.confusionMatrix$table, 
                                                 relevant = rownames(v2.metrica.format.confusionMatrix$table)[2])

# Variable-importance plot (top 20 features).
v2.var_imp_vars <- vip::vip(v2.mod, num_features=20)
v2.var_imp_vars

The variable importance plot looks very similar to that of balanced_data_v1.

v1.var_imp_vars

# Attach the thresholded hold-out predictions to v2.datest and flag each row
# as correctly / incorrectly classified, mirroring the v1 diagnostics above.
v2.incorrect <- v2.datest %>% 
  mutate(.pred = v2.metrica.format$predictions,
         correct = .pred == CL) %>% 
  select(Code, Name, Country, Year, CL, correct, everything())

v2.incorrect %>% summary() 
##       Code             Name               Country          Year      CL      
##  Min.   :  10101   Length:6555        Brazil  :4272   Min.   :2001   0:1787  
##  1st Qu.: 120608   Class :character   Colombia: 317   1st Qu.:2008   1:4768  
##  Median :1503457   Mode  :character   Peru    :1966   Median :2012           
##  Mean   :1783845                                      Mean   :2012           
##  3rd Qu.:2107506                                      3rd Qu.:2015           
##  Max.   :5300108                                      Max.   :2020           
##                                                                              
##   correct             NDVI             EVI            LST_Day     
##  Mode :logical   Min.   :0.1767   Min.   :0.1016   Min.   :11.77  
##  FALSE:1316      1st Qu.:0.5453   1st Qu.:0.3354   1st Qu.:25.95  
##  TRUE :5239      Median :0.6172   Median :0.3995   Median :28.82  
##                  Mean   :0.5928   Mean   :0.3809   Mean   :27.89  
##                  3rd Qu.:0.6722   3rd Qu.:0.4507   3rd Qu.:31.27  
##                  Max.   :0.8208   Max.   :0.5459   Max.   :36.71  
##                                                                   
##   min_LST_Day      max_LST_Day      LST_Night      min_LST_Night    
##  Min.   :-8.793   Min.   :18.22   Min.   :-5.268   Min.   :-37.750  
##  1st Qu.:17.961   1st Qu.:31.03   1st Qu.:15.358   1st Qu.:  5.081  
##  Median :23.644   Median :34.80   Median :20.881   Median : 15.916  
##  Mean   :20.851   Mean   :34.77   Mean   :17.029   Mean   : 10.702  
##  3rd Qu.:25.447   3rd Qu.:38.58   3rd Qu.:21.779   3rd Qu.: 18.420  
##  Max.   :31.086   Max.   :48.22   Max.   :24.953   Max.   : 22.886  
##                                                                     
##  max_LST_Night         HNTL             Precip       Evap_tavg        
##  Min.   :-1.856   Min.   : 0.0000   Min.   : 184   Min.   :1.718e-06  
##  1st Qu.:19.443   1st Qu.: 0.1043   1st Qu.:1179   1st Qu.:3.313e-05  
##  Median :23.256   Median : 0.5435   Median :1636   Median :4.163e-05  
##  Mean   :19.940   Mean   : 1.9367   Mean   :1702   Mean   :3.879e-05  
##  3rd Qu.:24.188   3rd Qu.: 2.3949   3rd Qu.:2102   3rd Qu.:4.581e-05  
##  Max.   :28.188   Max.   :60.8290   Max.   :4831   Max.   :6.252e-05  
##                                                                       
##   Qair_f_tavg       SoilMoi00_10cm_tavg  Wind_f_tavg     Tair_f_tavg   
##  Min.   :0.004301   Min.   :0.1988      Min.   :1.412   Min.   :273.6  
##  1st Qu.:0.011649   1st Qu.:0.3041      1st Qu.:2.354   1st Qu.:294.6  
##  Median :0.014090   Median :0.3297      Median :3.111   Median :298.9  
##  Mean   :0.013457   Mean   :0.3339      Mean   :3.169   Mean   :295.3  
##  3rd Qu.:0.016298   3rd Qu.:0.3618      3rd Qu.:3.874   3rd Qu.:299.8  
##  Max.   :0.018958   Max.   :0.4281      Max.   :6.758   Max.   :302.2  
##                                                                        
##    Population          min_Precip         max_Precip     min_Precip_Month
##  Min.   :    111.6   Min.   :  0.0000   Min.   : 31.54   Min.   : 1.000  
##  1st Qu.:   3875.6   1st Qu.:  0.3931   1st Qu.:243.89   1st Qu.: 6.000  
##  Median :   8904.1   Median :  6.9627   Median :331.79   Median : 7.000  
##  Mean   :  24261.9   Mean   : 19.8774   Mean   :329.78   Mean   : 7.245  
##  3rd Qu.:  19207.4   3rd Qu.: 22.2811   3rd Qu.:403.59   3rd Qu.: 8.000  
##  Max.   :2791764.8   Max.   :281.0092   Max.   :949.69   Max.   :12.000  
##                                                                          
##  max_Precip_Month forest_density   SWChange_abs       SWChange_norm     
##  Min.   : 1.000   Min.   : 0.00   Min.   :-128.0000   Min.   :-128.000  
##  1st Qu.: 2.000   1st Qu.: 7.28   1st Qu.:  -3.6756   1st Qu.:   1.579  
##  Median : 3.000   Median :23.04   Median :   0.8671   Median :  15.067  
##  Mean   : 4.149   Mean   :33.27   Mean   :   2.6775   Mean   :  21.892  
##  3rd Qu.: 5.000   3rd Qu.:57.09   3rd Qu.:   8.1409   3rd Qu.:  39.375  
##  Max.   :12.000   Max.   :99.65   Max.   :  84.6450   Max.   : 100.000  
##                                   NA's   :85          NA's   :85        
##   SWOccurrence    SWRecurrence    SWSeasonality    max_Elevation 
##  Min.   : 0.00   Min.   : 24.00   Min.   : 1.000   Min.   :  29  
##  1st Qu.:41.24   1st Qu.: 77.28   1st Qu.: 5.324   1st Qu.: 349  
##  Median :60.27   Median : 85.10   Median : 7.271   Median : 646  
##  Mean   :58.09   Mean   : 83.49   Mean   : 7.172   Mean   :1528  
##  3rd Qu.:76.43   3rd Qu.: 91.27   3rd Qu.: 9.103   3rd Qu.:2763  
##  Max.   :98.41   Max.   :100.00   Max.   :11.908   Max.   :6544  
##  NA's   :85      NA's   :85       NA's   :112                    
##  mean_Elevation     min_Elevation    var_Elevation            fold      
##  Min.   :   3.383   Min.   :  -3.0   Min.   :     10.9   Min.   :1.000  
##  1st Qu.: 170.384   1st Qu.:  82.0   1st Qu.:    901.2   1st Qu.:1.000  
##  Median : 320.461   Median : 199.0   Median :   4683.5   Median :3.000  
##  Mean   :1006.483   Mean   : 631.1   Mean   :  89736.7   Mean   :2.126  
##  3rd Qu.:1125.551   3rd Qu.: 544.0   3rd Qu.:  69556.4   3rd Qu.:3.000  
##  Max.   :4814.186   Max.   :4350.0   Max.   :2298170.4   Max.   :3.000  
##                                                                         
##  forest_fragmentation land_use_change      edge_loss             long         
##  Min.   :     0.0     Min.   :-7.70056   Min.   :-2677290   Min.   :-8863907  
##  1st Qu.:   686.8     1st Qu.:-0.25048   1st Qu.:  -13590   1st Qu.:-8226324  
##  Median :  1724.1     Median :-0.01798   Median :     480   Median :-6295076  
##  Mean   : 14365.0     Mean   :-0.17601   Mean   :   27872   Mean   :-6679746  
##  3rd Qu.:  5751.9     3rd Qu.: 0.03911   3rd Qu.:   29940   3rd Qu.:-5380194  
##  Max.   :750054.3     Max.   : 6.55408   Max.   : 4414320   Max.   :-4851417  
##                                                                               
##       lat           .pred   
##  Min.   :-2124011   0:2333  
##  1st Qu.:-1435452   1:4222  
##  Median : -952690           
##  Mean   : -934598           
##  3rd Qu.: -506889           
##  Max.   :  649268           
## 
# there is a 641 to 959 split between true CL absent vs. present
# NOTE(review): those counts do not match the summary above (CL 0:1787,
# 1:4768; correct FALSE:1316, TRUE:5239) -- likely stale from an earlier run.

# plots
# Long-format diagnostics for the v2 model, as for v1 above.
# Fix: wrap the external character vector in all_of() -- passing
# v2.imp_vars$var bare into select() is deprecated/ambiguous in tidyselect.
v2.incorrect.important_vars <- v2.incorrect %>% 
  select(Code, Name, Year, CL, correct, all_of(v2.imp_vars$var)) %>%
  mutate(log_Population = log(Population)) %>% 
  select(-Population) %>% 
  pivot_longer(cols = min_Elevation:log_Population)

# Boxplots of each top predictor, split by classification outcome.
ggplot(v2.incorrect.important_vars, aes(x = correct, y = value)) +
  geom_boxplot() +
  facet_wrap(vars(name), scales = "free")

# Density curves of each top predictor, coloured by classification outcome.
ggplot(v2.incorrect.important_vars, aes(x = value, col = correct)) +
  geom_density() +
  facet_wrap(vars(name), scales = "free")

Compare the final model performances after getting parameters from balanced_data_v1 versus balanced_data_v2.

# Final hold-out confusion matrix for the model tuned on balanced_data_v1.
v1.metrica.format.confusionMatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1323  734
##          1  464 4034
##                                           
##                Accuracy : 0.8172          
##                  95% CI : (0.8077, 0.8265)
##     No Information Rate : 0.7274          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.56            
##                                           
##  Mcnemar's Test P-Value : 7.735e-15       
##                                           
##             Sensitivity : 0.8461          
##             Specificity : 0.7403          
##          Pos Pred Value : 0.8968          
##          Neg Pred Value : 0.6432          
##              Prevalence : 0.7274          
##          Detection Rate : 0.6154          
##    Detection Prevalence : 0.6862          
##       Balanced Accuracy : 0.7932          
##                                           
##        'Positive' Class : 1               
## 
# ROC curve and AUC of the thresholded v1 predictions.
v1.auc_out %>% plot()

v1.auc_out$auc
## Area under the curve: 0.7932
# Final hold-out confusion matrix for the model tuned on balanced_data_v2.
v2.metrica.format.confusionMatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1402  931
##          1  385 3837
##                                           
##                Accuracy : 0.7992          
##                  95% CI : (0.7893, 0.8089)
##     No Information Rate : 0.7274          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5379          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8047          
##             Specificity : 0.7846          
##          Pos Pred Value : 0.9088          
##          Neg Pred Value : 0.6009          
##              Prevalence : 0.7274          
##          Detection Rate : 0.5854          
##    Detection Prevalence : 0.6441          
##       Balanced Accuracy : 0.7946          
##                                           
##        'Positive' Class : 1               
## 
# ROC curve and AUC of the thresholded v2 predictions.
v2.auc_out %>% plot()

v2.auc_out$auc
## Area under the curve: 0.7946

Now we return our focus to a weighted approach for choosing a threshold and predictor variables, using each fold’s accuracy p-value (this.metrics$confusion_matrix$overall["AccuracyPValue"][[1]]) as the weighting criterion. Let’s try it on the metrics produced by balanced_data_v1.

Weighted approach

For this approach, we get two sets of the top-ranked variables — one based on the AUC and one based on the accuracy p-value of their respective models — and then test them separately.

balanced_data_v1

# create empty accumulators to be filled
threshold <- c()
# Typed, zero-row frame: avoids the original dummy-row hack (a blank first
# row that later had to be dropped with filter(row_number() > 1)) and keeps
# auc / p_val numeric from the start.
imp.vars <- data.frame(variable = character(), auc = numeric(), p_val = numeric())
auc <- c()
p_val <- c()

for(j in resume_year:max(balanced_data_v1$Year)){
  for(i in balanced_data_v1$fold %>% unique() %>% sort()){
    # make sure file exists (we excluded several folds in preliminary analysis due to a class being absent)
    this.path <- paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/metrics/datest/", i, "_", j, "_metrics.rds")
    if (!file.exists(this.path)) {next} 
    
    this.metrics <- readRDS(this.path)
    
    threshold <- c(threshold, this.metrics$best_threshold[[1]])
    # One row per top-ranked variable, tagged with its fold's AUC and accuracy p-value.
    imp.vars <- imp.vars %>% 
      rbind(data.frame(variable = this.metrics$imp_vars, auc = this.metrics$auc$auc[[1]], p_val = this.metrics$confusion_matrix$overall["AccuracyPValue"][[1]]))
    auc <- c(auc, this.metrics$auc$auc[[1]])
    p_val <- c(p_val, this.metrics$confusion_matrix$overall["AccuracyPValue"][[1]])
  }
}

# get mean threshold
mean.threshold <- mean(threshold)

# Count how often each variable appears in a fold's importance list.
# Fix: the original called factor() on the whole imp.vars data frame
# (data.frame(var = factor(imp.vars))), which emitted the
# "cannot xtfrm data frames" warning and produced a meaningless count.
v1.imp.vars.count <- imp.vars %>% 
  count(var = variable) %>% 
  arrange(desc(n))

# get mean auc
mean.auc <- mean(auc)
# get mean accuracy p-value
mean.p_val <- mean(p_val)

# Per-variable mean AUC / accuracy p-value over the folds in which the
# variable ranked. Columns are numeric already, so no as.numeric() coercion
# is needed; the previously computed-but-unused auc.rank / p_val.rank
# columns were dropped.
imp.vars.df.ranked <- imp.vars %>% 
  group_by(variable) %>% 
  summarise(mean.auc = mean(auc),
            mean.p_val = mean(p_val)) 

# get auc vars (best mean AUC first)
weighted_v1.auc.vars <- imp.vars.df.ranked %>% 
  arrange(desc(mean.auc))

# get p-val vars (smallest mean accuracy p-value first)
weighted_v1.p_val.vars <- imp.vars.df.ranked %>% 
  arrange(mean.p_val)

metrics_df.balanced_v1 <- data.frame(metric = c('AUC', 'Accuracy P-value', 'Threshold'), value = c(mean.auc, mean.p_val, mean.threshold))
metrics_df.balanced_v1
##             metric     value
## 1              AUC 0.7523907
## 2 Accuracy P-value 0.5261833
## 3        Threshold 0.8011194
# Keep the ten strongest variables from each ranking.
weighted_v1.auc.imp_vars <- weighted_v1.auc.vars %>% 
  slice_head(n = 10)

weighted_v1.p_val.imp_vars <- weighted_v1.p_val.vars %>% 
  slice_head(n = 10)
# weighted_v1.datrain/datest -> holdout.train/test

set.seed(123)
## AUC-ranked variables first ##
# Predictors = top-10 variables by mean fold AUC, plus the four forest
# covariates we always retain.
weighted_v1.auc.predictor_names <- c(weighted_v1.auc.imp_vars$variable, "forest_density", "forest_fragmentation", "land_use_change", "edge_loss") %>%
  unique()

weighted_v1.auc.ml_formula <- as.formula(paste0("CL ~", paste(weighted_v1.auc.predictor_names, collapse=" + ")))

# get testing and training split
# split <- initial_split(data, strata = CL)
# weighted_v1.datrain <- training(split)
# weighted_v1.datest <- testing(split)

# Reuse the balanced hold-out split defined earlier.
weighted_v1.datrain <- balanced.holdout.train
weighted_v1.datest <- balanced.holdout.test

# get model
weighted_v1.auc.mod <- glm(weighted_v1.auc.ml_formula, data=weighted_v1.datrain, family=binomial)

# calculate out of sample model performance on v1.datest
weighted_v1.auc.oob <- predict(weighted_v1.auc.mod, newdata=weighted_v1.datest, type='response', na.action = na.pass)

weighted_v1.auc.temp_auc_out <- pROC::roc(response = weighted_v1.datest$CL, predictor= weighted_v1.auc.oob, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
# NOTE(review): the threshold is chosen on the hold-out test set itself,
# which optimistically biases the downstream metrics (see earlier sections).
weighted_v1.auc.best_threshold_out <- pROC::coords(weighted_v1.auc.temp_auc_out, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
# Binarize truth and predictions at the chosen threshold.
weighted_v1.auc.metrica.format <- data.frame(
  cbind(ifelse(weighted_v1.datest$CL==1, 1, 0),
        ifelse(weighted_v1.auc.oob>=weighted_v1.auc.best_threshold_out[1,1], 1, 0))) %>% # 0.821
  mutate_all(as.factor)

colnames(weighted_v1.auc.metrica.format) <- c("labels","predictions")
rownames(weighted_v1.auc.metrica.format) <- 1:dim(weighted_v1.auc.metrica.format)[1]

weighted_v1.auc.auc_out <- pROC::roc(response = weighted_v1.auc.metrica.format$labels, predictor= weighted_v1.auc.metrica.format$predictions %>% as.numeric() - 1, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
weighted_v1.auc.metrica.format.confusionMatrix <- weighted_v1.auc.metrica.format$predictions %>% 
  confusionMatrix(positive='1', reference=weighted_v1.auc.metrica.format$labels)

# NOTE(review): caret::precision computes precision, not specificity -- confirm intent.
weighted_v1.auc.sensitivity_out <- caret::recall(data = weighted_v1.auc.metrica.format.confusionMatrix$table, 
                                    relevant = rownames(weighted_v1.auc.metrica.format.confusionMatrix$table)[2]) 
weighted_v1.auc.specificity_out <- caret::precision(data = weighted_v1.auc.metrica.format.confusionMatrix$table, 
                                                 relevant = rownames(weighted_v1.auc.metrica.format.confusionMatrix$table)[2])

weighted_v1.auc.var_imp_vars <- vip::vip(weighted_v1.auc.mod)
set.seed(123)
## p-value-ranked variables next ##
# (fix: the original comment said "auc variables first", copied from the
# block above; this block uses the accuracy-p-value ranking)
weighted_v1.p_val.predictor_names <- c(weighted_v1.p_val.imp_vars$variable, "forest_density", "forest_fragmentation", "land_use_change", "edge_loss") %>%
  unique()

weighted_v1.p_val.ml_formula <- as.formula(paste0("CL ~", paste(weighted_v1.p_val.predictor_names, collapse=" + ")))

# get testing and training split
# split <- initial_split(data, strata = CL)
# weighted_v1.datrain <- training(split)
# weighted_v1.datest <- testing(split)
# Fix: this line previously assigned to the misspelled name
# "weighted_1.datrain", so the model below silently reused the
# weighted_v1.datrain created in the AUC block -- same value, but only by luck.
weighted_v1.datrain <- balanced.holdout.train
weighted_v1.datest <- balanced.holdout.test

# get model
weighted_v1.p_val.mod <- glm(weighted_v1.p_val.ml_formula, data=weighted_v1.datrain, family=binomial)

# calculate out of sample model performance on weighted_v1.datest
weighted_v1.p_val.oob <- predict(weighted_v1.p_val.mod, newdata=weighted_v1.datest, type='response', na.action = na.pass)

weighted_v1.p_val.temp_auc_out <- pROC::roc(response = weighted_v1.datest$CL, predictor= weighted_v1.p_val.oob, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
# NOTE(review): threshold chosen on the hold-out test set itself (leakage).
weighted_v1.p_val.best_threshold_out <- pROC::coords(weighted_v1.p_val.temp_auc_out, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
# Binarize truth and predictions at the chosen threshold.
weighted_v1.p_val.metrica.format <- data.frame(
  cbind(ifelse(weighted_v1.datest$CL==1, 1, 0),
        ifelse(weighted_v1.p_val.oob>=weighted_v1.p_val.best_threshold_out[1,1], 1, 0))) %>% # 0.821
  mutate_all(as.factor)

colnames(weighted_v1.p_val.metrica.format) <- c("labels","predictions")
rownames(weighted_v1.p_val.metrica.format) <- 1:dim(weighted_v1.p_val.metrica.format)[1]

weighted_v1.p_val.auc_out <- pROC::roc(response = weighted_v1.p_val.metrica.format$labels, predictor= weighted_v1.p_val.metrica.format$predictions %>% as.numeric() - 1, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
weighted_v1.p_val.metrica.format.confusionMatrix <- weighted_v1.p_val.metrica.format$predictions %>% 
  confusionMatrix(positive='1', reference=weighted_v1.p_val.metrica.format$labels)

weighted_v1.p_val.sensitivity_out <- caret::recall(data = weighted_v1.p_val.metrica.format.confusionMatrix$table, 
                                    relevant = rownames(weighted_v1.p_val.metrica.format.confusionMatrix$table)[2]) 
weighted_v1.p_val.specificity_out <- caret::precision(data = weighted_v1.p_val.metrica.format.confusionMatrix$table, 
                                                 relevant = rownames(weighted_v1.p_val.metrica.format.confusionMatrix$table)[2])

weighted_v1.p_val.var_imp_vars <- vip::vip(weighted_v1.p_val.mod)

Both models severely underperform when classifying absent points.

balanced_data_v2

# create empty accumulators to be filled
threshold <- c()
# Typed, zero-row frame: no dummy first row, so no row_number() > 1 hack later.
imp.vars <- data.frame(variable = character(), auc = numeric(), p_val = numeric())
auc <- c()
p_val <- c()

for(j in resume_year:max(balanced_data_v2$Year)){
  for(i in balanced_data_v2$fold %>% unique() %>% sort()){
    # make sure file exists (we excluded several folds in preliminary analysis due to a class being absent)
    this.path <- paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/metrics.v2/datest/", i, "_", j, "_metrics.rds")
    if (!file.exists(this.path)) {next} 
    
    this.metrics <- readRDS(this.path)
    
    threshold <- c(threshold, this.metrics$best_threshold[[1]])
    # One row per top-ranked variable, tagged with its fold's AUC and accuracy p-value.
    imp.vars <- imp.vars %>% 
      rbind(data.frame(variable = this.metrics$imp_vars, auc = this.metrics$auc$auc[[1]], p_val = this.metrics$confusion_matrix$overall["AccuracyPValue"][[1]]))
    auc <- c(auc, this.metrics$auc$auc[[1]])
    p_val <- c(p_val, this.metrics$confusion_matrix$overall["AccuracyPValue"][[1]])
  }
}

# get mean threshold
mean.threshold <- mean(threshold)

# Count how often each variable appears in a fold's importance list.
# Fix: the original called factor() on the whole imp.vars data frame,
# which warned "cannot xtfrm data frames" and produced a meaningless count.
v2.imp.vars.count <- imp.vars %>% 
  count(var = variable) %>% 
  arrange(desc(n))

# get mean auc
mean.auc <- mean(auc)
# get mean accuracy p-value
mean.p_val <- mean(p_val)

# Per-variable mean AUC and accuracy p-value across folds.
# Fix: columns were previously named mean.auc.rank / mean.p_val.rank even
# though they held raw means, not ranks; the unused rank columns are gone.
imp.vars.df.ranked <- imp.vars %>% 
  group_by(variable) %>% 
  summarise(mean.auc = mean(auc),
            mean.p_val = mean(p_val)) 

# variables ranked by mean AUC (best first)
weighted_v2.auc.vars <- imp.vars.df.ranked %>% 
  arrange(desc(mean.auc))

# variables ranked by mean accuracy p-value (smallest first)
weighted_v2.p_val.vars <- imp.vars.df.ranked %>% 
  arrange(mean.p_val)

metrics_df.balanced_v2 <- data.frame(metric = c('AUC', 'Accuracy P-value', 'Threshold'), value = c(mean.auc, mean.p_val, mean.threshold))
metrics_df.balanced_v2
##             metric     value
## 1              AUC 0.7526243
## 2 Accuracy P-value 0.1034658
## 3        Threshold 0.2897455
# Keep the ten strongest variables from each ranking.
weighted_v2.auc.imp_vars <- weighted_v2.auc.vars %>% 
  slice_head(n = 10)

weighted_v2.p_val.imp_vars <- weighted_v2.p_val.vars %>% 
  slice_head(n = 10)
set.seed(123)

## AUC-ranked variables first ##
# Predictors = top-10 variables by mean fold AUC, plus the four forest
# covariates we always retain.
weighted_v2.auc.predictor_names <- c(weighted_v2.auc.imp_vars$variable, "forest_density", "forest_fragmentation", "land_use_change", "edge_loss") %>%
  unique()

weighted_v2.auc.ml_formula <- as.formula(paste0("CL ~", paste(weighted_v2.auc.predictor_names, collapse=" + ")))

# get testing and training split
# NOTE(review): this split is computed but unused (the training()/testing()
# lines are commented out); it still consumes random numbers after
# set.seed(123), so removing it would shift the RNG stream.
split <- initial_split(data, strata = CL)
# weighted_v2.datrain <- training(split)
# weighted_v2.datest <- testing(split)
weighted_v2.datrain <- balanced.holdout.train
weighted_v2.datest <- balanced.holdout.test

# get model
weighted_v2.auc.mod <- glm(weighted_v2.auc.ml_formula, data=weighted_v2.datrain, family=binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# calculate out of sample model performance on weighted_v2.datest
weighted_v2.auc.oob <- predict(weighted_v2.auc.mod, newdata=weighted_v2.datest, type='response', na.action = na.pass)

weighted_v2.auc.temp_auc_out <- pROC::roc(response = weighted_v2.datest$CL, predictor= weighted_v2.auc.oob, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
# NOTE(review): threshold chosen on the hold-out test set itself (leakage).
weighted_v2.auc.best_threshold_out <- pROC::coords(weighted_v2.auc.temp_auc_out, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
# Binarize truth and predictions at the chosen threshold.
weighted_v2.auc.metrica.format <- data.frame(
  cbind(ifelse(weighted_v2.datest$CL==1, 1, 0),
        ifelse(weighted_v2.auc.oob>=weighted_v2.auc.best_threshold_out[1,1], 1, 0))) %>% # 0.821
  mutate_all(as.factor)

colnames(weighted_v2.auc.metrica.format) <- c("labels","predictions")
rownames(weighted_v2.auc.metrica.format) <- 1:dim(weighted_v2.auc.metrica.format)[1]

weighted_v2.auc.auc_out <- pROC::roc(response = weighted_v2.auc.metrica.format$labels, predictor= weighted_v2.auc.metrica.format$predictions %>% as.numeric() - 1, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
weighted_v2.auc.metrica.format.confusionMatrix <- weighted_v2.auc.metrica.format$predictions %>% 
  confusionMatrix(positive='1', reference=weighted_v2.auc.metrica.format$labels)

# NOTE(review): caret::precision computes precision, not specificity -- confirm intent.
weighted_v2.auc.sensitivity_out <- caret::recall(data = weighted_v2.auc.metrica.format.confusionMatrix$table, 
                                    relevant = rownames(weighted_v2.auc.metrica.format.confusionMatrix$table)[2]) 
weighted_v2.auc.specificity_out <- caret::precision(data = weighted_v2.auc.metrica.format.confusionMatrix$table, 
                                                 relevant = rownames(weighted_v2.auc.metrica.format.confusionMatrix$table)[2])

weighted_v2.auc.var_imp_vars <- vip::vip(weighted_v2.auc.mod)
set.seed(123)
## p-value variables second ##  (comment fixed: this is the p-value ranking,
## not the AUC ranking — the previous label was a copy-paste leftover)
# Top p-value-ranked variables plus the core forest covariates
weighted_v2.p_val.predictor_names <- c(weighted_v2.p_val.imp_vars$variable, "forest_density", "forest_fragmentation", "land_use_change", "edge_loss") %>%
  unique()

weighted_v2.p_val.ml_formula <- as.formula(paste0("CL ~", paste(weighted_v2.p_val.predictor_names, collapse=" + ")))

# get testing and training split
# NOTE(review): initial_split() is kept only so the RNG stream matches earlier
# runs; the balanced holdout split below is what is actually used
split <- initial_split(data, strata = CL)
weighted_v2.datrain <- balanced.holdout.train
weighted_v2.datest <- balanced.holdout.test

# fit the logistic regression on the balanced holdout training data
weighted_v2.p_val.mod <- glm(weighted_v2.p_val.ml_formula, data=weighted_v2.datrain, family=binomial)

# out-of-sample predicted probabilities on weighted_v2.datest
weighted_v2.p_val.oob <- predict(weighted_v2.p_val.mod, newdata=weighted_v2.datest, type='response', na.action = na.pass)

weighted_v2.p_val.temp_auc_out <- pROC::roc(response = weighted_v2.datest$CL, predictor= weighted_v2.p_val.oob, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
# ROC-optimal probability threshold for converting probabilities to classes
weighted_v2.p_val.best_threshold_out <- pROC::coords(weighted_v2.p_val.temp_auc_out, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds

# label/prediction pairs as factors, the two-column format used downstream
weighted_v2.p_val.metrica.format <- data.frame(
  cbind(ifelse(weighted_v2.datest$CL==1, 1, 0),
        ifelse(weighted_v2.p_val.oob>=weighted_v2.p_val.best_threshold_out[1,1], 1, 0))) %>% # 0.821
  mutate_all(as.factor)

colnames(weighted_v2.p_val.metrica.format) <- c("labels","predictions")
rownames(weighted_v2.p_val.metrica.format) <- 1:dim(weighted_v2.p_val.metrica.format)[1]

weighted_v2.p_val.auc_out <- pROC::roc(response = weighted_v2.p_val.metrica.format$labels, predictor= weighted_v2.p_val.metrica.format$predictions %>% as.numeric() - 1, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
weighted_v2.p_val.metrica.format.confusionMatrix <- weighted_v2.p_val.metrica.format$predictions %>% 
  confusionMatrix(positive='1', reference=weighted_v2.p_val.metrica.format$labels)

# sensitivity = recall of the positive class (level "1", second row/column)
weighted_v2.p_val.sensitivity_out <- caret::recall(data = weighted_v2.p_val.metrica.format.confusionMatrix$table, 
                                    relevant = rownames(weighted_v2.p_val.metrica.format.confusionMatrix$table)[2]) 
# BUG FIX: this previously called caret::precision(), which computes
# TP / (TP + FP), not specificity (TN / (TN + FP)); use caret::specificity()
# with the negative class (level "0", first row/column) instead
weighted_v2.p_val.specificity_out <- caret::specificity(data = weighted_v2.p_val.metrica.format.confusionMatrix$table, 
                                                 negative = rownames(weighted_v2.p_val.metrica.format.confusionMatrix$table)[1])

# variable-importance plot for the fitted model
weighted_v2.p_val.var_imp_vars <- vip::vip(weighted_v2.p_val.mod)

Results

We open the results section with the confusion matrices and AUCs of all models.

Full variables:

# Confusion matrix for the full-variable model on the held-out test set
full.metrica.format.confusionMatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1248  600
##          1  488 4096
##                                           
##                Accuracy : 0.8308          
##                  95% CI : (0.8215, 0.8399)
##     No Information Rate : 0.7301          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5793          
##                                           
##  Mcnemar's Test P-Value : 0.0007649       
##                                           
##             Sensitivity : 0.8722          
##             Specificity : 0.7189          
##          Pos Pred Value : 0.8935          
##          Neg Pred Value : 0.6753          
##              Prevalence : 0.7301          
##          Detection Rate : 0.6368          
##    Detection Prevalence : 0.7127          
##       Balanced Accuracy : 0.7956          
##                                           
##        'Positive' Class : 1               
## 
# ROC/AUC of the thresholded predictions for the full-variable model
full.auc_out
## 
## Call:
## roc.default(response = full.metrica.format$labels, predictor = full.metrica.format$predictions %>%     as.numeric() - 1, levels = c(0, 1), auc = TRUE)
## 
## Data: full.metrica.format$predictions %>% as.numeric() - 1 in 1736 controls (full.metrica.format$labels 0) < 4696 cases (full.metrica.format$labels 1).
## Area under the curve: 0.7956

First unweighted resampling approach (v1):

# Confusion matrix for the first unweighted resampling approach (v1)
v1.metrica.format.confusionMatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1323  734
##          1  464 4034
##                                           
##                Accuracy : 0.8172          
##                  95% CI : (0.8077, 0.8265)
##     No Information Rate : 0.7274          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.56            
##                                           
##  Mcnemar's Test P-Value : 7.735e-15       
##                                           
##             Sensitivity : 0.8461          
##             Specificity : 0.7403          
##          Pos Pred Value : 0.8968          
##          Neg Pred Value : 0.6432          
##              Prevalence : 0.7274          
##          Detection Rate : 0.6154          
##    Detection Prevalence : 0.6862          
##       Balanced Accuracy : 0.7932          
##                                           
##        'Positive' Class : 1               
## 
# ROC/AUC of the thresholded predictions for the v1 unweighted approach
v1.auc_out
## 
## Call:
## roc.default(response = v1.metrica.format$labels, predictor = v1.metrica.format$predictions %>%     as.ordered(), levels = c(0, 1), auc = TRUE)
## 
## Data: v1.metrica.format$predictions %>% as.ordered() in 1787 controls (v1.metrica.format$labels 0) < 4768 cases (v1.metrica.format$labels 1).
## Area under the curve: 0.7932

Second unweighted resampling approach (v2):

# Confusion matrix for the second unweighted resampling approach (v2)
v2.metrica.format.confusionMatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1402  931
##          1  385 3837
##                                           
##                Accuracy : 0.7992          
##                  95% CI : (0.7893, 0.8089)
##     No Information Rate : 0.7274          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5379          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8047          
##             Specificity : 0.7846          
##          Pos Pred Value : 0.9088          
##          Neg Pred Value : 0.6009          
##              Prevalence : 0.7274          
##          Detection Rate : 0.5854          
##    Detection Prevalence : 0.6441          
##       Balanced Accuracy : 0.7946          
##                                           
##        'Positive' Class : 1               
## 
# ROC/AUC of the thresholded predictions for the v2 unweighted approach
v2.auc_out
## 
## Call:
## roc.default(response = v2.metrica.format$labels, predictor = v2.metrica.format$predictions %>%     as.numeric() - 1, levels = c(0, 1), auc = TRUE)
## 
## Data: v2.metrica.format$predictions %>% as.numeric() - 1 in 1787 controls (v2.metrica.format$labels 0) < 4768 cases (v2.metrica.format$labels 1).
## Area under the curve: 0.7946

First weighted variables approach (v1):

# Confusion matrix for the v1 weighted approach, AUC-ranked variables
weighted_v1.auc.metrica.format.confusionMatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1142  595
##          1  587 4146
##                                           
##                Accuracy : 0.8173          
##                  95% CI : (0.8077, 0.8267)
##     No Information Rate : 0.7328          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.5342          
##                                           
##  Mcnemar's Test P-Value : 0.8387          
##                                           
##             Sensitivity : 0.8745          
##             Specificity : 0.6605          
##          Pos Pred Value : 0.8760          
##          Neg Pred Value : 0.6575          
##              Prevalence : 0.7328          
##          Detection Rate : 0.6408          
##    Detection Prevalence : 0.7315          
##       Balanced Accuracy : 0.7675          
##                                           
##        'Positive' Class : 1               
## 
# ROC/AUC for the v1 weighted approach, AUC-ranked variables
weighted_v1.auc.auc_out
## 
## Call:
## roc.default(response = weighted_v1.auc.metrica.format$labels,     predictor = weighted_v1.auc.metrica.format$predictions %>%         as.numeric() - 1, levels = c(0, 1), auc = TRUE)
## 
## Data: weighted_v1.auc.metrica.format$predictions %>% as.numeric() - 1 in 1729 controls (weighted_v1.auc.metrica.format$labels 0) < 4741 cases (weighted_v1.auc.metrica.format$labels 1).
## Area under the curve: 0.7675
# Confusion matrix for the v1 weighted approach, p-value-ranked variables
weighted_v1.p_val.metrica.format.confusionMatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1044  479
##          1  664 4256
##                                          
##                Accuracy : 0.8226         
##                  95% CI : (0.813, 0.8319)
##     No Information Rate : 0.7349         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.5284         
##                                          
##  Mcnemar's Test P-Value : 5.255e-08      
##                                          
##             Sensitivity : 0.8988         
##             Specificity : 0.6112         
##          Pos Pred Value : 0.8650         
##          Neg Pred Value : 0.6855         
##              Prevalence : 0.7349         
##          Detection Rate : 0.6606         
##    Detection Prevalence : 0.7636         
##       Balanced Accuracy : 0.7550         
##                                          
##        'Positive' Class : 1              
## 
# ROC/AUC for the v1 weighted approach, p-value-ranked variables
weighted_v1.p_val.auc_out
## 
## Call:
## roc.default(response = weighted_v1.p_val.metrica.format$labels,     predictor = weighted_v1.p_val.metrica.format$predictions %>%         as.numeric() - 1, levels = c(0, 1), auc = TRUE)
## 
## Data: weighted_v1.p_val.metrica.format$predictions %>% as.numeric() - 1 in 1708 controls (weighted_v1.p_val.metrica.format$labels 0) < 4735 cases (weighted_v1.p_val.metrica.format$labels 1).
## Area under the curve: 0.755

Second weighted variables approach (v2):

# Confusion matrix for the v2 weighted approach, AUC-ranked variables
weighted_v2.auc.metrica.format.confusionMatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1337  945
##          1  450 3823
##                                          
##                Accuracy : 0.7872         
##                  95% CI : (0.7771, 0.797)
##     No Information Rate : 0.7274         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.5062         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.8018         
##             Specificity : 0.7482         
##          Pos Pred Value : 0.8947         
##          Neg Pred Value : 0.5859         
##              Prevalence : 0.7274         
##          Detection Rate : 0.5832         
##    Detection Prevalence : 0.6519         
##       Balanced Accuracy : 0.7750         
##                                          
##        'Positive' Class : 1              
## 
# ROC/AUC for the v2 weighted approach, AUC-ranked variables
weighted_v2.auc.auc_out
## 
## Call:
## roc.default(response = weighted_v2.auc.metrica.format$labels,     predictor = weighted_v2.auc.metrica.format$predictions %>%         as.numeric() - 1, levels = c(0, 1), auc = TRUE)
## 
## Data: weighted_v2.auc.metrica.format$predictions %>% as.numeric() - 1 in 1787 controls (weighted_v2.auc.metrica.format$labels 0) < 4768 cases (weighted_v2.auc.metrica.format$labels 1).
## Area under the curve: 0.775
# Confusion matrix for the v2 weighted approach, p-value-ranked variables
weighted_v2.p_val.metrica.format.confusionMatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1078  556
##          1  651 4185
##                                           
##                Accuracy : 0.8134          
##                  95% CI : (0.8037, 0.8229)
##     No Information Rate : 0.7328          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5152          
##                                           
##  Mcnemar's Test P-Value : 0.006817        
##                                           
##             Sensitivity : 0.8827          
##             Specificity : 0.6235          
##          Pos Pred Value : 0.8654          
##          Neg Pred Value : 0.6597          
##              Prevalence : 0.7328          
##          Detection Rate : 0.6468          
##    Detection Prevalence : 0.7474          
##       Balanced Accuracy : 0.7531          
##                                           
##        'Positive' Class : 1               
## 
# ROC/AUC for the v2 weighted approach, p-value-ranked variables
weighted_v2.p_val.auc_out
## 
## Call:
## roc.default(response = weighted_v2.p_val.metrica.format$labels,     predictor = weighted_v2.p_val.metrica.format$predictions %>%         as.numeric() - 1, levels = c(0, 1), auc = TRUE)
## 
## Data: weighted_v2.p_val.metrica.format$predictions %>% as.numeric() - 1 in 1729 controls (weighted_v2.p_val.metrica.format$labels 0) < 4741 cases (weighted_v2.p_val.metrica.format$labels 1).
## Area under the curve: 0.7531

The weighted-variable approaches perform very poorly at predicting absences. The objectively “best”-performing model is the one using all variables, but the unweighted-variable approaches are close behind. These should replace the full-variable approach since they use fewer variables, which improves generalizability.

# Attach the full-variable model's predictions to the test set and flag
# whether each observation was classified correctly
full.incorrect <- full.datest %>%
  mutate(.pred = full.metrica.format$predictions,
         correct = .pred == CL) %>%
  select(Code, Name, Country, Year, CL, .pred, correct, everything())

full.incorrect %>% summary() 
##       Code             Name               Country          Year      CL      
##  Min.   :  10101   Length:6554        Brazil  :4299   Min.   :2001   0:1828  
##  1st Qu.: 120807   Class :character   Colombia: 339   1st Qu.:2008   1:4726  
##  Median :1503309   Mode  :character   Peru    :1916   Median :2012           
##  Mean   :1804953                                      Mean   :2011           
##  3rd Qu.:2107704                                      3rd Qu.:2015           
##  Max.   :5300108                                      Max.   :2020           
##                                                                              
##   .pred       correct             NDVI             EVI             LST_Day     
##  0   :1848   Mode :logical   Min.   :0.1748   Min.   :0.09244   Min.   :11.86  
##  1   :4584   FALSE:1088      1st Qu.:0.5450   1st Qu.:0.33562   1st Qu.:26.11  
##  NA's: 122   TRUE :5344      Median :0.6193   Median :0.40060   Median :28.69  
##              NA's :122       Mean   :0.5942   Mean   :0.38187   Mean   :27.87  
##                              3rd Qu.:0.6751   3rd Qu.:0.45251   3rd Qu.:31.23  
##                              Max.   :0.8208   Max.   :0.55824   Max.   :37.33  
##                                                                                
##   min_LST_Day      max_LST_Day      LST_Night      min_LST_Night    
##  Min.   :-15.19   Min.   :18.28   Min.   :-6.076   Min.   :-38.503  
##  1st Qu.: 18.13   1st Qu.:31.02   1st Qu.:15.678   1st Qu.:  5.795  
##  Median : 23.68   Median :34.77   Median :20.865   Median : 15.934  
##  Mean   : 20.86   Mean   :34.76   Mean   :17.048   Mean   : 10.830  
##  3rd Qu.: 25.37   3rd Qu.:38.62   3rd Qu.:21.792   3rd Qu.: 18.433  
##  Max.   : 30.41   Max.   :48.45   Max.   :24.856   Max.   : 22.929  
##                                                                     
##  max_LST_Night         HNTL             Precip       Evap_tavg        
##  Min.   :-1.545   Min.   : 0.0000   Min.   : 184   Min.   :1.500e-06  
##  1st Qu.:19.768   1st Qu.: 0.1057   1st Qu.:1177   1st Qu.:3.289e-05  
##  Median :23.243   Median : 0.5350   Median :1650   Median :4.175e-05  
##  Mean   :19.932   Mean   : 1.9441   Mean   :1714   Mean   :3.883e-05  
##  3rd Qu.:24.194   3rd Qu.: 2.4261   3rd Qu.:2114   3rd Qu.:4.585e-05  
##  Max.   :27.895   Max.   :62.4059   Max.   :4696   Max.   :6.196e-05  
##                                                                       
##   Qair_f_tavg       SoilMoi00_10cm_tavg  Wind_f_tavg     Tair_f_tavg   
##  Min.   :0.004105   Min.   :0.2003      Min.   :1.414   Min.   :273.6  
##  1st Qu.:0.011753   1st Qu.:0.3043      1st Qu.:2.329   1st Qu.:295.0  
##  Median :0.014116   Median :0.3302      Median :3.085   Median :298.9  
##  Mean   :0.013484   Mean   :0.3349      Mean   :3.147   Mean   :295.3  
##  3rd Qu.:0.016364   3rd Qu.:0.3643      3rd Qu.:3.854   3rd Qu.:299.8  
##  Max.   :0.018908   Max.   :0.4276      Max.   :6.770   Max.   :302.3  
##                                                                        
##    Population          min_Precip        max_Precip      min_Precip_Month
##  Min.   :    218.2   Min.   :  0.000   Min.   :  31.54   Min.   : 1.000  
##  1st Qu.:   3896.3   1st Qu.:  0.365   1st Qu.: 245.34   1st Qu.: 6.000  
##  Median :   9131.3   Median :  6.984   Median : 331.23   Median : 7.000  
##  Mean   :  25640.5   Mean   : 20.599   Mean   : 330.70   Mean   : 7.222  
##  3rd Qu.:  19547.5   3rd Qu.: 23.708   3rd Qu.: 407.55   3rd Qu.: 8.000  
##  Max.   :2791764.8   Max.   :266.818   Max.   :1267.35   Max.   :12.000  
##                                                                          
##  max_Precip_Month forest_density    SWChange_abs      SWChange_norm     
##  Min.   : 1.000   Min.   : 0.000   Min.   :-128.000   Min.   :-128.000  
##  1st Qu.: 2.000   1st Qu.: 7.266   1st Qu.:  -3.184   1st Qu.:   1.497  
##  Median : 3.000   Median :23.216   Median :   1.110   Median :  14.630  
##  Mean   : 4.147   Mean   :33.754   Mean   :   2.684   Mean   :  21.503  
##  3rd Qu.: 5.000   3rd Qu.:57.953   3rd Qu.:   8.046   3rd Qu.:  38.663  
##  Max.   :12.000   Max.   :99.628   Max.   :  84.645   Max.   : 100.000  
##                                    NA's   :90         NA's   :90        
##   SWOccurrence    SWRecurrence    SWSeasonality    max_Elevation 
##  Min.   : 0.00   Min.   : 24.00   Min.   : 1.000   Min.   :  29  
##  1st Qu.:41.98   1st Qu.: 77.46   1st Qu.: 5.340   1st Qu.: 344  
##  Median :60.37   Median : 85.27   Median : 7.290   Median : 642  
##  Mean   :58.52   Mean   : 83.64   Mean   : 7.233   Mean   :1516  
##  3rd Qu.:77.41   3rd Qu.: 91.56   3rd Qu.: 9.199   3rd Qu.:2700  
##  Max.   :98.41   Max.   :100.00   Max.   :11.908   Max.   :6607  
##  NA's   :92      NA's   :90       NA's   :122                    
##  mean_Elevation     min_Elevation    var_Elevation            fold      
##  Min.   :   3.383   Min.   :  -3.0   Min.   :     10.9   Min.   :1.000  
##  1st Qu.: 165.504   1st Qu.:  80.0   1st Qu.:    866.5   1st Qu.:1.000  
##  Median : 315.948   Median : 198.0   Median :   4559.3   Median :3.000  
##  Mean   :1003.001   Mean   : 628.7   Mean   :  89019.1   Mean   :2.126  
##  3rd Qu.:1054.828   3rd Qu.: 540.0   3rd Qu.:  66456.5   3rd Qu.:3.000  
##  Max.   :4814.186   Max.   :4350.0   Max.   :1871815.9   Max.   :3.000  
##                                                                         
##  forest_fragmentation land_use_change      edge_loss             long         
##  Min.   :     0.0     Min.   :-8.41190   Min.   :-2436870   Min.   :-8866471  
##  1st Qu.:   670.8     1st Qu.:-0.22060   1st Qu.:  -13590   1st Qu.:-8211734  
##  Median :  1724.6     Median :-0.01662   Median :     600   Median :-6313985  
##  Mean   : 16240.9     Mean   :-0.16779   Mean   :   28823   Mean   :-6677677  
##  3rd Qu.:  5934.7     3rd Qu.: 0.03734   3rd Qu.:   30360   3rd Qu.:-5389090  
##  Max.   :750054.3     Max.   : 5.30915   Max.   : 2926320   Max.   :-4851417  
##                                                                               
##       lat          
##  Min.   :-2124011  
##  1st Qu.:-1427949  
##  Median : -948492  
##  Mean   : -927481  
##  3rd Qu.: -500867  
##  Max.   :  649268  
## 
# plots: compare covariate distributions for correctly vs incorrectly
# classified observations
full.incorrect.important_vars <- full.incorrect %>%
  select(Code, Name, Year, CL, correct, everything()) %>%
  # log-transform Population before pivoting; the raw scale is heavily skewed
  mutate(log_Population = log(Population)) %>%
  select(-Population) %>%
  pivot_longer(cols = NDVI:log_Population)

# boxplots of each covariate split by classification outcome
ggplot(full.incorrect.important_vars, aes(x = correct, y = value)) +
  geom_boxplot() +
  facet_wrap(facets = 'name', scales = 'free', ncol = 3)
## Warning: Removed 484 rows containing non-finite values (`stat_boxplot()`).

# density curves of each covariate coloured by classification outcome
ggplot(full.incorrect.important_vars, aes(x = value, col = correct)) +
  geom_density() +
  facet_wrap(facets = 'name', scales = 'free', ncol = 3)
## Warning: Removed 484 rows containing non-finite values (`stat_density()`).

Get spatial distribution of correctly and incorrectly classified municipios

# get a data frame that gives, for each municipio, the fraction of years it
# was classified incorrectly, alongside summary forest covariates.
# BUG FIX: the previous version computed num_years/num_correct with mutate()
# and then referenced those columns in summarise(), which returned one row per
# observation (not per group) — triggering the dplyr 1.1.0 reframe()
# deprecation warning and emitting duplicated rows per Code that had to be
# removed downstream with unique(). Computing directly inside summarise()
# yields exactly one row per municipio.
percent_wrong.df <- full.incorrect %>% 
  select(Code, Year, Country, correct, 
         land_use_change, forest_density, forest_fragmentation, edge_loss, mean_Elevation) %>% 
  group_by(Code) %>% # group by Code since observations per municipio differ by country and code
  summarise(percent_wrong = 1 - sum(correct) / n_distinct(Year),
            median_deforested = median(land_use_change),
            max_forest_change = max(forest_density) - min(forest_density),
            mean_forest_fragmentation = mean(forest_fragmentation),
            mean_edge_loss = mean(edge_loss),
            mean_Elevation = mean(mean_Elevation),
            .groups = 'drop')
# Load municipio polygon geometries; unique() keeps one geometry row per Code
geom.df <- readRDS("~/peregrine_amazon/data/annual/aad_2021_forests_geom.rds") %>% 
  select(Code, geometry) %>% 
  unique()

# Join per-municipio error rates to geometries and convert to sf for mapping;
# the final unique() drops duplicate rows (the summarise() above can emit
# repeated rows per Code)
percent_wrong.geom.df <- percent_wrong.df %>% 
  left_join(geom.df, by='Code') %>% 
  filter(!is.na(percent_wrong)) %>% 
  sf::st_as_sf() %>% 
  unique()

# Municipios misclassified in every observed year
percent_wrong.all_incorrect.df <- percent_wrong.geom.df %>%
  filter(percent_wrong == 1)

ggplot(percent_wrong.all_incorrect.df, aes(x = max_forest_change)) +
  geom_density()

# Municipios misclassified in at least one observed year
percent_wrong.no_correct.df <- percent_wrong.geom.df %>%
  filter(percent_wrong > 0)

ggplot(percent_wrong.no_correct.df, aes(x = max_forest_change)) +
  geom_density()

# interactive maps; change the zcol argument to colour by another covariate
mapview::mapview(percent_wrong.all_incorrect.df, zcol = 'mean_forest_fragmentation')
mapview::mapview(percent_wrong.no_correct.df, zcol = 'mean_forest_fragmentation')
# Summary of municipios misclassified in at least one year
percent_wrong.no_correct.df %>% summary()
##       Code         percent_wrong    median_deforested   max_forest_change 
##  Min.   :  10101   Min.   :0.1250   Min.   :-2.105779   Min.   : 0.00000  
##  1st Qu.:  97777   1st Qu.:0.3333   1st Qu.:-0.113465   1st Qu.: 0.07426  
##  Median :1505007   Median :0.5000   Median :-0.004684   Median : 0.44214  
##  Mean   :1755515   Mean   :0.5654   Mean   :-0.054830   Mean   : 1.65983  
##  3rd Qu.:2110807   3rd Qu.:1.0000   3rd Qu.: 0.025067   3rd Qu.: 1.48892  
##  Max.   :5222302   Max.   :1.0000   Max.   : 5.309147   Max.   :27.76996  
##  mean_forest_fragmentation mean_edge_loss       mean_Elevation    
##  Min.   :     0.0          Min.   :-1114110.0   Min.   :   4.539  
##  1st Qu.:   423.5          1st Qu.:   -6420.0   1st Qu.: 185.203  
##  Median :  1117.0          Median :      60.0   Median : 448.087  
##  Mean   :  7545.1          Mean   :     692.8   Mean   :1055.654  
##  3rd Qu.:  2672.9          3rd Qu.:   10425.0   3rd Qu.:1903.913  
##  Max.   :529854.2          Max.   : 2194560.0   Max.   :4471.740  
##           geometry  
##  MULTIPOLYGON :574  
##  POLYGON      : 55  
##  epsg:3857    :  0  
##  +proj=merc...:  0  
##                     
## 
# Summary of municipios misclassified in every year
percent_wrong.all_incorrect.df %>% summary()
##       Code         percent_wrong median_deforested  max_forest_change  
##  Min.   :  10101   Min.   :1     Min.   :-1.80907   Min.   : 0.000000  
##  1st Qu.:  56162   1st Qu.:1     1st Qu.:-0.08929   1st Qu.: 0.000000  
##  Median : 120605   Median :1     Median : 0.00000   Median : 0.000543  
##  Mean   : 929534   Mean   :1     Mean   : 0.03270   Mean   : 0.693783  
##  3rd Qu.:1301782   3rd Qu.:1     3rd Qu.: 0.07411   3rd Qu.: 0.279505  
##  Max.   :5222302   Max.   :1     Max.   : 5.30915   Max.   :15.910696  
##  mean_forest_fragmentation mean_edge_loss    mean_Elevation    
##  Min.   :    0.0           Min.   :-547140   Min.   :   4.539  
##  1st Qu.:  226.5           1st Qu.:  -1929   1st Qu.: 404.332  
##  Median :  690.2           Median :    700   Median :2151.608  
##  Mean   : 1428.3           Mean   :  16070   Mean   :1851.612  
##  3rd Qu.: 1629.8           3rd Qu.:  14644   3rd Qu.:2926.715  
##  Max.   :13134.9           Max.   :2194560   Max.   :4471.740  
##           geometry  
##  MULTIPOLYGON :158  
##  POLYGON      :  9  
##  epsg:3857    :  0  
##  +proj=merc...:  0  
##                     
##